diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 77caf3ef82d3b4c63fd9a218c1b362fa73706550..b555df825447a42d053d583bd11d92719e998cb4 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -486,6 +486,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/spec_store_bypass /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds + /sys/devices/system/cpu/vulnerabilities/srbds /sys/devices/system/cpu/vulnerabilities/tsx_async_abort /sys/devices/system/cpu/vulnerabilities/itlb_multihit Date: January 2018 diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c2a4b652bd1a26579c34dfa373e7ad2e6f33f293..ce3e05e41724ae18fb0a3b8d3ae4be79538840a2 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1170,6 +1170,13 @@ PAGE_SIZE multiple when read back. Under certain circumstances, the usage may go over the limit temporarily. + In default configuration regular 0-order allocations always + succeed unless OOM killer chooses current task as a victim. + + Some kinds of allocations don't invoke the OOM killer. + Caller could retry them differently, return into userspace + as -ENOMEM or silently ignore in cases like disk readahead. + This is the ultimate protection mechanism. As long as the high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. @@ -1226,17 +1233,9 @@ PAGE_SIZE multiple when read back. The number of time the cgroup's memory usage was reached the limit and allocation was about to fail. - Depending on context result could be invocation of OOM - killer and retrying allocation or failing allocation. - - Failed allocation in its turn could be returned into - userspace as -ENOMEM or silently ignored in cases like - disk readahead. For now OOM in memory cgroup kills - tasks iff shortage has happened inside page fault. - This event is not raised if the OOM killer is not considered as an option, e.g. for failed high-order - allocations. + allocations or if caller asked to not retry attempts. oom_kill The number of processes belonging to this cgroup diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 0dc2eb8e44e5fa56ad19cb1191f182090ee614f8..1012bd9305e905f9b75cc0539fae1f9fe0531f60 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -13,6 +13,11 @@ kernel code to obtain additional kernel information. Currently, if ``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically enabled per-callsite. +If you do not want to enable dynamic debug globally (i.e. in some embedded +system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic +debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any +modules which you'd like to dynamically debug later. + If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just shortcut for ``print_hex_dump(KERN_DEBUG)``. diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 0795e3c2643f2946c3f1a58418bc6d28c376ec06..ca4dbdd9016d5a873b11381dbe75bdff5ee13038 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -14,3 +14,4 @@ are configurable at compile, boot or run time. mds tsx_async_abort multihit.rst + special-register-buffer-data-sampling.rst diff --git a/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst b/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst new file mode 100644 index 0000000000000000000000000000000000000000..47b1b3afac994beb278d4923e262a1da7bad5a23 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst @@ -0,0 +1,149 @@ +.. SPDX-License-Identifier: GPL-2.0 + +SRBDS - Special Register Buffer Data Sampling +============================================= + +SRBDS is a hardware vulnerability that allows MDS :doc:`mds` techniques to +infer values returned from special register accesses. Special register +accesses are accesses to off core registers. According to Intel's evaluation, +the special register reads that have a security expectation of privacy are +RDRAND, RDSEED and SGX EGETKEY. + +When RDRAND, RDSEED and EGETKEY instructions are used, the data is moved +to the core through the special register mechanism that is susceptible +to MDS attacks. + +Affected processors +-------------------- +Core models (desktop, mobile, Xeon-E3) that implement RDRAND and/or RDSEED may +be affected. + +A processor is affected by SRBDS if its Family_Model and stepping is +in the following list, with the exception of the listed processors +exporting MDS_NO while Intel TSX is available yet not enabled. The +latter class of processors are only affected when Intel TSX is enabled +by software using TSX_CTRL_MSR otherwise they are not affected. + + ============= ============ ======== + common name Family_Model Stepping + ============= ============ ======== + IvyBridge 06_3AH All + + Haswell 06_3CH All + Haswell_L 06_45H All + Haswell_G 06_46H All + + Broadwell_G 06_47H All + Broadwell 06_3DH All + + Skylake_L 06_4EH All + Skylake 06_5EH All + + Kabylake_L 06_8EH <= 0xC + Kabylake 06_9EH <= 0xD + ============= ============ ======== + +Related CVEs +------------ + +The following CVE entry is related to this SRBDS issue: + + ============== ===== ===================================== + CVE-2020-0543 SRBDS Special Register Buffer Data Sampling + ============== ===== ===================================== + +Attack scenarios +---------------- +An unprivileged user can extract values returned from RDRAND and RDSEED +executed on another core or sibling thread using MDS techniques. + + +Mitigation mechanism +------------------- +Intel will release microcode updates that modify the RDRAND, RDSEED, and +EGETKEY instructions to overwrite secret special register data in the shared +staging buffer before the secret data can be accessed by another logical +processor. + +During execution of the RDRAND, RDSEED, or EGETKEY instructions, off-core +accesses from other logical processors will be delayed until the special +register read is complete and the secret data in the shared staging buffer is +overwritten. + +This has three effects on performance: + +#. RDRAND, RDSEED, or EGETKEY instructions have higher latency. + +#. Executing RDRAND at the same time on multiple logical processors will be + serialized, resulting in an overall reduction in the maximum RDRAND + bandwidth. + +#. Executing RDRAND, RDSEED or EGETKEY will delay memory accesses from other + logical processors that miss their core caches, with an impact similar to + legacy locked cache-line-split accesses. + +The microcode updates provide an opt-out mechanism (RNGDS_MITG_DIS) to disable +the mitigation for RDRAND and RDSEED instructions executed outside of Intel +Software Guard Extensions (Intel SGX) enclaves. On logical processors that +disable the mitigation using this opt-out mechanism, RDRAND and RDSEED do not +take longer to execute and do not impact performance of sibling logical +processors memory accesses. The opt-out mechanism does not affect Intel SGX +enclaves (including execution of RDRAND or RDSEED inside an enclave, as well +as EGETKEY execution). + +IA32_MCU_OPT_CTRL MSR Definition +-------------------------------- +Along with the mitigation for this issue, Intel added a new thread-scope +IA32_MCU_OPT_CTRL MSR, (address 0x123). The presence of this MSR and +RNGDS_MITG_DIS (bit 0) is enumerated by CPUID.(EAX=07H,ECX=0).EDX[SRBDS_CTRL = +9]==1. This MSR is introduced through the microcode update. + +Setting IA32_MCU_OPT_CTRL[0] (RNGDS_MITG_DIS) to 1 for a logical processor +disables the mitigation for RDRAND and RDSEED executed outside of an Intel SGX +enclave on that logical processor. Opting out of the mitigation for a +particular logical processor does not affect the RDRAND and RDSEED mitigations +for other logical processors. + +Note that inside of an Intel SGX enclave, the mitigation is applied regardless +of the value of RNGDS_MITG_DS. + +Mitigation control on the kernel command line +--------------------------------------------- +The kernel command line allows control over the SRBDS mitigation at boot time +with the option "srbds=". The option for this is: + + ============= ============================================================= + off This option disables SRBDS mitigation for RDRAND and RDSEED on + affected platforms. + ============= ============================================================= + +SRBDS System Information +----------------------- +The Linux kernel provides vulnerability status information through sysfs. For +SRBDS this can be accessed by the following sysfs file: +/sys/devices/system/cpu/vulnerabilities/srbds + +The possible values contained in this file are: + + ============================== ============================================= + Not affected Processor not vulnerable + Vulnerable Processor vulnerable and mitigation disabled + Vulnerable: No microcode Processor vulnerable and microcode is missing + mitigation + Mitigation: Microcode Processor is vulnerable and mitigation is in + effect. + Mitigation: TSX disabled Processor is only vulnerable when TSX is + enabled while this system was booted with TSX + disabled. + Unknown: Dependent on + hypervisor status Running on virtual guest processor that is + affected but with no way to know if host + processor is mitigated or vulnerable. + ============================== ============================================= + +SRBDS Default mitigation +------------------------ +This new microcode serializes processor access during execution of RDRAND, +RDSEED ensures that the shared buffer is overwritten before it is released for +reuse. Use the "srbds=off" kernel command line to disable the mitigation for +RDRAND and RDSEED. diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index ac7e131d293503e26bd70cd2c7ed655c168c9301..2da65fef2a1c63d9f6f5252c45f1304e423f725e 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -521,6 +521,14 @@ will cause a kdump to occur at the panic() call. In cases where a user wants to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 to achieve the same behaviour. +Trigger Kdump on add_taint() +============================ + +The kernel parameter panic_on_taint facilitates a conditional call to panic() +from within add_taint() whenever the value set in this bitmask matches with the +bit flag being set by add_taint(). +This will cause a kdump to occur at the add_taint()->panic() call. + Contact ======= diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f3eeecbb3f636c528a7e8a4b195cd610af38cd03..fb95fad81c79a065e728202a34a3dabac5653cb4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1445,7 +1445,7 @@ hardlockup_all_cpu_backtrace= [KNL] Should the hard-lockup detector generate backtraces on all cpus. - Format: + Format: 0 | 1 hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on @@ -1513,9 +1513,9 @@ hung_task_panic= [KNL] Should the hung task detector generate panics. - Format: + Format: 0 | 1 - A nonzero value instructs the kernel to panic when a + A value of 1 instructs the kernel to panic when a hung task is detected. The default value is controlled by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time option. The value selected by this boot parameter can @@ -3447,6 +3447,19 @@ bit 4: print ftrace buffer bit 5: print all printk messages in buffer + panic_on_taint= Bitmask for conditionally calling panic() in add_taint() + Format: [,nousertaint] + Hexadecimal bitmask representing the set of TAINT flags + that will cause the kernel to panic when add_taint() is + called with any of the flags in this set. + The optional switch "nousertaint" can be utilized to + prevent userspace forced crashes by writing to sysctl + /proc/sys/kernel/tainted any flagset matching with the + bitmask set on panic_on_taint. + See Documentation/admin-guide/tainted-kernels.rst for + extra details on the taint flags that users can pick + to compose the bitmask to assign to panic_on_taint. + panic_on_warn panic() instead of WARN(). Useful to cause kdump on a WARN(). @@ -3715,6 +3728,8 @@ may put more devices in an IOMMU group. force_floating [S390] Force usage of floating interrupts. nomio [S390] Do not use MIO instructions. + norid [S390] ignore the RID field and force use of + one PCI domain per PCI function pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. @@ -4652,9 +4667,9 @@ softlockup_panic= [KNL] Should the soft-lockup detector generate panics. - Format: + Format: 0 | 1 - A nonzero value instructs the soft-lockup detector + A value of 1 instructs the soft-lockup detector to panic the machine when a soft-lockup occurs. It is also controlled by the kernel.softlockup_panic sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the @@ -4663,7 +4678,7 @@ softlockup_all_cpu_backtrace= [KNL] Should the soft-lockup detector generate backtraces on all cpus. - Format: + Format: 0 | 1 sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst @@ -4822,6 +4837,26 @@ the kernel will oops in either "warn" or "fatal" mode. + srbds= [X86,INTEL] + Control the Special Register Buffer Data Sampling + (SRBDS) mitigation. + + Certain CPUs are vulnerable to an MDS-like + exploit which can leak bits from the random + number generator. + + By default, this issue is mitigated by + microcode. However, the microcode fix can cause + the RDRAND and RDSEED instructions to become + much slower. Among other effects, this will + result in reduced throughput from /dev/urandom. + + The microcode mitigation can be disabled with + the following option: + + off: Disable mitigation and remove + performance impact to RDRAND and RDSEED + srcutree.counter_wrap_check [KNL] Specifies how frequently to check for grace-period sequence counter wrap for the @@ -4956,6 +4991,15 @@ switches= [HW,M68k] + sysctl.*= [KNL] + Set a sysctl parameter, right before loading the init + process, as if the value was written to the respective + /proc/sys/... file. Both '.' and '/' are recognized as + separators. Unrecognized parameters and invalid values + are reported in the kernel log. Sysctls registered + later by a loaded module cannot be set this way. + Example: sysctl.vm.swappiness=40 + sysfs.deprecated=0|1 [KNL] Enable/disable old style sysfs layout for old udev on older distributions. When this option is enabled diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 8463f5538fda1fad6f5e341931aa64fea3f273bd..067a90a1499c49f3ffe20a0e10829fbf2a8ac661 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -364,19 +364,19 @@ follows: 2) for querying the policy, we do not need to take an extra reference on the target task's task policy nor vma policies because we always acquire the - task's mm's mmap_sem for read during the query. The set_mempolicy() and - mbind() APIs [see below] always acquire the mmap_sem for write when + task's mm's mmap_lock for read during the query. The set_mempolicy() and + mbind() APIs [see below] always acquire the mmap_lock for write when installing or replacing task or vma policies. Thus, there is no possibility of a task or thread freeing a policy while another task or thread is querying it. 3) Page allocation usage of task or vma policy occurs in the fault path where - we hold them mmap_sem for read. Again, because replacing the task or vma - policy requires that the mmap_sem be held for write, the policy can't be + we hold them mmap_lock for read. Again, because replacing the task or vma + policy requires that the mmap_lock be held for write, the policy can't be freed out from under us while we're using it for page allocation. 4) Shared policies require special consideration. One task can replace a - shared memory policy while another task, with a distinct mmap_sem, is + shared memory policy while another task, with a distinct mmap_lock, is querying or allocating a page based on the policy. To resolve this potential race, the shared policy infrastructure adds an extra reference to the shared policy during lookup while holding a spin lock on the shared diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 0bf49d7313ad8a6c3eed751dab224c301d7a3f7d..1dc2d5f823b4246feb5077ba332523a98f037567 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -33,7 +33,7 @@ memory ranges) provides two primary functionalities: The real advantage of userfaults if compared to regular virtual memory management of mremap/mprotect is that the userfaults in all their operations never involve heavyweight structures like vmas (in fact the -``userfaultfd`` runtime load never takes the mmap_sem for writing). +``userfaultfd`` runtime load never takes the mmap_lock for writing). Vmas are not suitable for page- (or hugepage) granular fault tracking when dealing with virtual address spaces that could span diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 1ebf68d0114110c29b9a62c35864667d5668a431..83acf50254886b49e6d6102d0eaf3fb16e145820 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -335,6 +335,20 @@ Path for the hotplug policy agent. Default value is "``/sbin/hotplug``". +hung_task_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when a hung task is detected. This file shows up if +CONFIG_DETECT_HUNG_TASK and CONFIG_SMP are enabled. + +0: Won't show all CPUs backtraces when a hung task is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +a hung task is detected. + + hung_task_panic =============== @@ -632,6 +646,22 @@ rate for each task. scanned for a given scan. +oops_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when an oops event occurs. It should be used as a last +resort in case a panic cannot be triggered (to protect VMs running, for +example) or kdump can't be collected. This file shows up if CONFIG_SMP +is enabled. + +0: Won't show all CPUs backtraces when an oops is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +an oops event is detected. + + osrelease, ostype & version =========================== @@ -1239,6 +1269,13 @@ ORed together. The letters are seen in "Tainted" line of Oops reports. See :doc:`/admin-guide/tainted-kernels` for more information. +Note: + writes to this sysctl interface will fail with ``EINVAL`` if the kernel is + booted with the command line option ``panic_on_taint=,nousertaint`` + and any of the ORed together values being written to ``tainted`` match with + the bitmask declared on panic_on_taint. + See :doc:`/admin-guide/kernel-parameters` for more details on that particular + kernel command line option and its optional ``nousertaint`` switch. threads-max =========== diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 2e939ff10b86c6a0ba98b031abc93d810a4e65fe..6068266dd303f4ccae5dcc75539ca9b03faba4b1 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -148,23 +148,46 @@ NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's because DAX pages do not have a separate page cache, and so "pinning" implies locking down file system blocks, which is not (yet) supported in that way. -CASE 3: Hardware with page faulting support -------------------------------------------- -Here, a well-written driver doesn't normally need to pin pages at all. However, -if the driver does choose to do so, it can register MMU notifiers for the range, -and will be called back upon invalidation. Either way (avoiding page pinning, or -using MMU notifiers to unpin upon request), there is proper synchronization with -both filesystem and mm (page_mkclean(), munmap(), etc). - -Therefore, neither flag needs to be set. - -In this case, ideally, neither get_user_pages() nor pin_user_pages() should be -called. Instead, the software should be written so that it does not pin pages. -This allows mm and filesystems to operate more efficiently and reliably. +CASE 3: MMU notifier registration, with or without page faulting hardware +------------------------------------------------------------------------- +Device drivers can pin pages via get_user_pages*(), and register for mmu +notifier callbacks for the memory range. Then, upon receiving a notifier +"invalidate range" callback , stop the device from using the range, and unpin +the pages. There may be other possible schemes, such as for example explicitly +synchronizing against pending IO, that accomplish approximately the same thing. + +Or, if the hardware supports replayable page faults, then the device driver can +avoid pinning entirely (this is ideal), as follows: register for mmu notifier +callbacks as above, but instead of stopping the device and unpinning in the +callback, simply remove the range from the device's page tables. + +Either way, as long as the driver unpins the pages upon mmu notifier callback, +then there is proper synchronization with both filesystem and mm +(page_mkclean(), munmap(), etc). Therefore, neither flag needs to be set. CASE 4: Pinning for struct page manipulation only ------------------------------------------------- -Here, normal GUP calls are sufficient, so neither flag needs to be set. +If only struct page data (as opposed to the actual memory contents that a page +is tracking) is affected, then normal GUP calls are sufficient, and neither flag +needs to be set. + +CASE 5: Pinning in order to write to the data within the page +------------------------------------------------------------- +Even though neither DMA nor Direct IO is involved, just a simple case of "pin, +write to a page's data, unpin" can cause a problem. Case 5 may be considered a +superset of Case 1, plus Case 2, plus anything that invokes that pattern. In +other words, if the code is neither Case 1 nor Case 2, it may still require +FOLL_PIN, for patterns like this: + +Correct (uses FOLL_PIN calls): + pin_user_pages() + write to the data within the pages + unpin_user_pages() + +INCORRECT (uses FOLL_GET calls): + get_user_pages() + write to the data within the pages + put_page() page_maybe_dma_pinned(): the whole point of pinning =================================================== diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index 5d1f56fcd2e7a2cfb8194288b34704b19d06848c..469d115a95f120fb38387d1668482fe5f98452ca 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -151,6 +151,29 @@ note some tests will require root privileges:: $ cd kselftest $ ./run_kselftest.sh +Packaging selftests +=================== + +In some cases packaging is desired, such as when tests need to run on a +different system. To package selftests, run:: + + $ make -C tools/testing/selftests gen_tar + +This generates a tarball in the `INSTALL_PATH/kselftest-packages` directory. By +default, `.gz` format is used. The tar format can be overridden by specifying +a `FORMAT` make variable. Any value recognized by `tar's auto-compress`_ option +is supported, such as:: + + $ make -C tools/testing/selftests gen_tar FORMAT=.xz + +`make gen_tar` invokes `make install` so you can use it to package a subset of +tests by using variables specified in `Running a subset of selftests`_ +section:: + + $ make -C tools/testing/selftests gen_tar TARGETS="bpf" FORMAT=.xz + +.. _tar's auto-compress: https://www.gnu.org/software/tar/manual/html_node/gzip.html#auto_002dcompress + Contributing new tests ====================== diff --git a/Documentation/dev-tools/kunit/start.rst b/Documentation/dev-tools/kunit/start.rst index e1c5ce80ce12bb4a9e3abe78baecbc157ab426fe..bb112cf70624ea0ea480e7aa4406b8f386ccdd73 100644 --- a/Documentation/dev-tools/kunit/start.rst +++ b/Documentation/dev-tools/kunit/start.rst @@ -32,15 +32,17 @@ test targets as well. The ``.kunitconfig`` should also contain any other config options required by the tests. A good starting point for a ``.kunitconfig`` is the KUnit defconfig: + .. code-block:: bash cd $PATH_TO_LINUX_REPO cp arch/um/configs/kunit_defconfig .kunitconfig You can then add any other Kconfig options you wish, e.g.: + .. code-block:: none - CONFIG_LIST_KUNIT_TEST=y + CONFIG_LIST_KUNIT_TEST=y :doc:`kunit_tool ` will ensure that all config options set in ``.kunitconfig`` are set in the kernel ``.config`` before running the tests. @@ -54,8 +56,8 @@ using. other tools (such as make menuconfig) to adjust other config options. -Running the tests ------------------ +Running the tests (KUnit Wrapper) +--------------------------------- To make sure that everything is set up correctly, simply invoke the Python wrapper from your kernel repo: @@ -105,8 +107,9 @@ have config options ending in ``_KUNIT_TEST``. KUnit and KUnit tests can be compiled as modules: in this case the tests in a module will be run when the module is loaded. -Running the tests ------------------ + +Running the tests (w/o KUnit Wrapper) +------------------------------------- Build and run your kernel as usual. Test output will be written to the kernel log in `TAP `_ format. diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst index 473a2361ec375ecf338d964f9a66aaa41f945c7e..3c3fe8b5feccf8fbdcade6c697e19a0bc14e5541 100644 --- a/Documentation/dev-tools/kunit/usage.rst +++ b/Documentation/dev-tools/kunit/usage.rst @@ -595,7 +595,7 @@ able to run one test case per invocation. KUnit debugfs representation ============================ When kunit test suites are initialized, they create an associated directory -in /sys/kernel/debug/kunit/. The directory contains one file +in ``/sys/kernel/debug/kunit/``. The directory contains one file - results: "cat results" displays results of each test case and the results of the entire suite for the last test run. @@ -604,4 +604,4 @@ The debugfs representation is primarily of use when kunit test suites are run in a native environment, either as modules or builtin. Having a way to display results like this is valuable as otherwise results can be intermixed with other events in dmesg output. The maximum size of each -results file is KUNIT_LOG_SIZE bytes (defined in include/kunit/test.h). +results file is KUNIT_LOG_SIZE bytes (defined in ``include/kunit/test.h``). diff --git a/Documentation/devicetree/bindings/iommu/allwinner,sun50i-h6-iommu.yaml b/Documentation/devicetree/bindings/iommu/allwinner,sun50i-h6-iommu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e125cf2a88b458cd5c3aa7c1b2079bd474d73fc --- /dev/null +++ b/Documentation/devicetree/bindings/iommu/allwinner,sun50i-h6-iommu.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/iommu/allwinner,sun50i-h6-iommu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner H6 IOMMU Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + "#iommu-cells": + const: 1 + description: + The content of the cell is the master ID. + + compatible: + const: allwinner,sun50i-h6-iommu + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + + resets: + maxItems: 1 + +required: + - "#iommu-cells" + - compatible + - reg + - interrupts + - clocks + - resets + +additionalProperties: false + +examples: + - | + #include + #include + + #include + #include + + iommu: iommu@30f0000 { + compatible = "allwinner,sun50i-h6-iommu"; + reg = <0x030f0000 0x10000>; + interrupts = ; + clocks = <&ccu CLK_BUS_IOMMU>; + resets = <&ccu RST_BUS_IOMMU>; + #iommu-cells = <1>; + }; + +... diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml index 218f7ecb4703df02bd6430b0793336920ec04c05..d7ceb4c34423b82079cea8a22757d38f593e7c12 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml +++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml @@ -42,7 +42,9 @@ properties: - const: arm,mmu-500 - const: arm,smmu-v2 - items: - - const: arm,mmu-401 + - enum: + - arm,mmu-400 + - arm,mmu-401 - const: arm,smmu-v1 - enum: - arm,smmu-v1 diff --git a/Documentation/devicetree/bindings/remoteproc/ingenic,vpu.yaml b/Documentation/devicetree/bindings/remoteproc/ingenic,vpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c019f9fbe916b5b71664a2dbef77777315ee87ae --- /dev/null +++ b/Documentation/devicetree/bindings/remoteproc/ingenic,vpu.yaml @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/remoteproc/ingenic,vpu.yaml#" +$schema: "http://devicetree.org/meta-schemas/core.yaml#" + +title: Ingenic Video Processing Unit bindings + +description: + Inside the Video Processing Unit (VPU) of the recent JZ47xx SoCs from + Ingenic is a second Xburst MIPS CPU very similar to the main core. + This document describes the devicetree bindings for this auxiliary + processor. + +maintainers: + - Paul Cercueil + +properties: + compatible: + const: ingenic,jz4770-vpu-rproc + + reg: + items: + - description: aux registers + - description: tcsm0 registers + - description: tcsm1 registers + - description: sram registers + + reg-names: + items: + - const: aux + - const: tcsm0 + - const: tcsm1 + - const: sram + + clocks: + items: + - description: aux clock + - description: vpu clock + + clock-names: + items: + - const: aux + - const: vpu + + interrupts: + description: VPU hardware interrupt + +required: + - compatible + - reg + - reg-names + - clocks + - clock-names + - interrupts + +additionalProperties: false + +examples: + - | + #include + + vpu: video-decoder@132a0000 { + compatible = "ingenic,jz4770-vpu-rproc"; + + reg = <0x132a0000 0x20>, /* AUX */ + <0x132b0000 0x4000>, /* TCSM0 */ + <0x132c0000 0xc000>, /* TCSM1 */ + <0x132f0000 0x7000>; /* SRAM */ + reg-names = "aux", "tcsm0", "tcsm1", "sram"; + + clocks = <&cgu JZ4770_CLK_AUX>, <&cgu JZ4770_CLK_VPU>; + clock-names = "aux", "vpu"; + + interrupt-parent = <&cpuintc>; + interrupts = <3>; + }; diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt index 9938918b2fea3e4593e749d556ff43f91310d0ce..54737024da20886e52507852fed25f8be01d8598 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt @@ -15,12 +15,16 @@ on the Qualcomm ADSP Hexagon core. "qcom,qcs404-adsp-pas" "qcom,qcs404-cdsp-pas" "qcom,qcs404-wcss-pas" + "qcom,sc7180-mpss-pas" "qcom,sdm845-adsp-pas" "qcom,sdm845-cdsp-pas" "qcom,sm8150-adsp-pas" "qcom,sm8150-cdsp-pas" "qcom,sm8150-mpss-pas" "qcom,sm8150-slpi-pas" + "qcom,sm8250-adsp-pas" + "qcom,sm8250-cdsp-pas" + "qcom,sm8250-slpi-pas" - interrupts-extended: Usage: required @@ -44,8 +48,12 @@ on the Qualcomm ADSP Hexagon core. qcom,sm8150-adsp-pas: qcom,sm8150-cdsp-pas: qcom,sm8150-slpi-pas: + qcom,sm8250-adsp-pas: + qcom,sm8250-cdsp-pas: + qcom,sm8250-slpi-pas: must be "wdog", "fatal", "ready", "handover", "stop-ack" qcom,qcs404-wcss-pas: + qcom,sc7180-mpss-pas: qcom,sm8150-mpss-pas: must be "wdog", "fatal", "ready", "handover", "stop-ack", "shutdown-ack" @@ -105,10 +113,14 @@ on the Qualcomm ADSP Hexagon core. qcom,sdm845-cdsp-pas: qcom,sm8150-adsp-pas: qcom,sm8150-cdsp-pas: + qcom,sm8250-cdsp-pas: must be "cx", "load_state" + qcom,sc7180-mpss-pas: qcom,sm8150-mpss-pas: must be "cx", "load_state", "mss" + qcom,sm8250-adsp-pas: qcom,sm8150-slpi-pas: + qcom,sm8250-slpi-pas: must be "lcx", "lmx", "load_state" - memory-region: diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt index 88dfa3fc15f7837e9b9de33afc84de97ea89d497..1f9a62e13ebe0215324ecb13f12a9b6836fd2a1c 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt @@ -79,7 +79,7 @@ on the Qualcomm Hexagon core. "snoc_axi", "mnoc_axi", "qdss" qcom,sc7180-mss-pil: must be "iface", "bus", "xo", "snoc_axi", "mnoc_axi", - "mss_crypto", "mss_nav", "nav" + "nav" qcom,sdm845-mss-pil: must be "iface", "bus", "mem", "xo", "gpll0_mss", "snoc_axi", "mnoc_axi", "prng" @@ -102,6 +102,14 @@ on the Qualcomm Hexagon core. must be "mss_restart", "pdc_reset" for the modem sub-system on SC7180, SDM845 SoCs +For devices where the mba and mpss sub-nodes are not specified, mba/mpss region +should be referenced as follows: +- memory-region: + Usage: required + Value type: + Definition: reference to the reserved-memory for the mba region followed + by the mpss region + For the compatible strings below the following supplies are required: "qcom,q6v5-pil" "qcom,msm8916-mss-pil", @@ -173,16 +181,15 @@ For the compatible string below the following supplies are required: For the compatible strings below the following phandle references are required: "qcom,sc7180-mss-pil" -- qcom,halt-nav-regs: +- qcom,spare-regs: Usage: required Value type: - Definition: reference to a list of 2 phandles with one offset each for - the modem sub-system running on SC7180 SoC. The first - phandle reference is to the mss clock node followed by the - offset within register space for nav halt register. The - second phandle reference is to a syscon representing TCSR - followed by the offset within syscon for conn_box_spare0 - register. + Definition: a phandle reference to a syscon representing TCSR followed + by the offset within syscon for conn_box_spare0 register + used by the modem sub-system running on SC7180 SoC. + +The Hexagon node must contain iommus property as described in ../iommu/iommu.txt +on platforms which do not have TrustZone. = SUBNODES: The Hexagon node must contain two subnodes, named "mba" and "mpss" representing diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.rst similarity index 63% rename from Documentation/filesystems/gfs2-glocks.txt rename to Documentation/filesystems/gfs2-glocks.rst index 7059623635b2d32fdaac02856447d6ab851eb04e..d14f230f0b1234ea035dbba93fd5229707ef95d7 100644 --- a/Documentation/filesystems/gfs2-glocks.txt +++ b/Documentation/filesystems/gfs2-glocks.rst @@ -1,5 +1,8 @@ - Glock internal locking rules - ------------------------------ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +Glock internal locking rules +============================ This documents the basic principles of the glock state machine internals. Each glock (struct gfs2_glock in fs/gfs2/incore.h) @@ -24,24 +27,28 @@ There are three lock states that users of the glock layer can request, namely shared (SH), deferred (DF) and exclusive (EX). Those translate to the following DLM lock modes: -Glock mode | DLM lock mode ------------------------------- - UN | IV/NL Unlocked (no DLM lock associated with glock) or NL - SH | PR (Protected read) - DF | CW (Concurrent write) - EX | EX (Exclusive) +========== ====== ===================================================== +Glock mode DLM lock mode +========== ====== ===================================================== + UN IV/NL Unlocked (no DLM lock associated with glock) or NL + SH PR (Protected read) + DF CW (Concurrent write) + EX EX (Exclusive) +========== ====== ===================================================== Thus DF is basically a shared mode which is incompatible with the "normal" shared lock mode, SH. In GFS2 the DF mode is used exclusively for direct I/O operations. The glocks are basically a lock plus some routines which deal with cache management. The following rules apply for the cache: -Glock mode | Cache data | Cache Metadata | Dirty Data | Dirty Metadata --------------------------------------------------------------------------- - UN | No | No | No | No - SH | Yes | Yes | No | No - DF | No | Yes | No | No - EX | Yes | Yes | Yes | Yes +========== ========== ============== ========== ============== +Glock mode Cache data Cache Metadata Dirty Data Dirty Metadata +========== ========== ============== ========== ============== + UN No No No No + SH Yes Yes No No + DF No Yes No No + EX Yes Yes Yes Yes +========== ========== ============== ========== ============== These rules are implemented using the various glock operations which are defined for each type of glock. Not all types of glocks use @@ -49,21 +56,23 @@ all the modes. Only inode glocks use the DF mode for example. Table of glock operations and per type constants: -Field | Purpose ----------------------------------------------------------------------------- -go_xmote_th | Called before remote state change (e.g. to sync dirty data) -go_xmote_bh | Called after remote state change (e.g. to refill cache) -go_inval | Called if remote state change requires invalidating the cache -go_demote_ok | Returns boolean value of whether its ok to demote a glock - | (e.g. checks timeout, and that there is no cached data) -go_lock | Called for the first local holder of a lock -go_unlock | Called on the final local unlock of a lock -go_dump | Called to print content of object for debugfs file, or on - | error to dump glock to the log. -go_type | The type of the glock, LM_TYPE_..... -go_callback | Called if the DLM sends a callback to drop this lock -go_flags | GLOF_ASPACE is set, if the glock has an address space - | associated with it +============= ============================================================= +Field Purpose +============= ============================================================= +go_xmote_th Called before remote state change (e.g. to sync dirty data) +go_xmote_bh Called after remote state change (e.g. to refill cache) +go_inval Called if remote state change requires invalidating the cache +go_demote_ok Returns boolean value of whether its ok to demote a glock + (e.g. checks timeout, and that there is no cached data) +go_lock Called for the first local holder of a lock +go_unlock Called on the final local unlock of a lock +go_dump Called to print content of object for debugfs file, or on + error to dump glock to the log. +go_type The type of the glock, ``LM_TYPE_*`` +go_callback Called if the DLM sends a callback to drop this lock +go_flags GLOF_ASPACE is set, if the glock has an address space + associated with it +============= ============================================================= The minimum hold time for each lock is the time after a remote lock grant for which we ignore remote demote requests. This is in order to @@ -82,21 +91,25 @@ rather than via the glock. Locking rules for glock operations: -Operation | GLF_LOCK bit lock held | gl_lockref.lock spinlock held -------------------------------------------------------------------------- -go_xmote_th | Yes | No -go_xmote_bh | Yes | No -go_inval | Yes | No -go_demote_ok | Sometimes | Yes -go_lock | Yes | No -go_unlock | Yes | No -go_dump | Sometimes | Yes -go_callback | Sometimes (N/A) | Yes - -N.B. Operations must not drop either the bit lock or the spinlock -if its held on entry. go_dump and do_demote_ok must never block. -Note that go_dump will only be called if the glock's state -indicates that it is caching uptodate data. +============= ====================== ============================= +Operation GLF_LOCK bit lock held gl_lockref.lock spinlock held +============= ====================== ============================= +go_xmote_th Yes No +go_xmote_bh Yes No +go_inval Yes No +go_demote_ok Sometimes Yes +go_lock Yes No +go_unlock Yes No +go_dump Sometimes Yes +go_callback Sometimes (N/A) Yes +============= ====================== ============================= + +.. Note:: + + Operations must not drop either the bit lock or the spinlock + if its held on entry. go_dump and do_demote_ok must never block. + Note that go_dump will only be called if the glock's state + indicates that it is caching uptodate data. Glock locking order within GFS2: @@ -104,7 +117,7 @@ Glock locking order within GFS2: 2. Rename glock (for rename only) 3. Inode glock(s) (Parents before children, inodes at "same level" with same parent in - lock number order) + lock number order) 4. Rgrp glock(s) (for (de)allocation operations) 5. Transaction glock (via gfs2_trans_begin) for non-read operations 6. i_rw_mutex (if required) @@ -117,8 +130,8 @@ determine the lifetime of the inode in question. Locking of inodes is on a per-inode basis. Locking of rgrps is on a per rgrp basis. In general we prefer to lock local locks prior to cluster locks. - Glock Statistics - ------------------ +Glock Statistics +---------------- The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The @@ -173,8 +186,8 @@ we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for -allocation (to base it on lock wait time, rather than blindly -using a "try lock") + allocation (to base it on lock wait time, rather than blindly + using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken @@ -195,10 +208,13 @@ as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. -Per sb stats can be found here: -/sys/kernel/debug/gfs2//sbstats -Per glock stats can be found here: -/sys/kernel/debug/gfs2//glstats +Per sb stats can be found here:: + + /sys/kernel/debug/gfs2//sbstats + +Per glock stats can be found here:: + + /sys/kernel/debug/gfs2//glstats Assuming that debugfs is mounted on /sys/kernel/debug and also that is replaced with the name of the gfs2 filesystem @@ -206,14 +222,16 @@ in question. The abbreviations used in the output as are follows: -srtt - Smoothed round trip time for non-blocking dlm requests -srttvar - Variance estimate for srtt -srttb - Smoothed round trip time for (potentially) blocking dlm requests -srttvarb - Variance estimate for srttb -sirt - Smoothed inter-request time (for dlm requests) -sirtvar - Variance estimate for sirt -dlm - Number of dlm requests made (dcnt in glstats file) -queue - Number of glock requests queued (qcnt in glstats file) +========= ================================================================ +srtt Smoothed round trip time for non blocking dlm requests +srttvar Variance estimate for srtt +srttb Smoothed round trip time for (potentially) blocking dlm requests +srttvarb Variance estimate for srttb +sirt Smoothed inter request time (for dlm requests) +sirtvar Variance estimate for sirt +dlm Number of dlm requests made (dcnt in glstats file) +queue Number of glock requests queued (qcnt in glstats file) +========= ================================================================ The sbstats file contains a set of these stats for each glock type (so 8 lines for each type) and for each cpu (one column per cpu). The glstats file contains @@ -224,9 +242,12 @@ The gfs2_glock_lock_time tracepoint prints out the current values of the stats for the glock in question, along with some addition information on each dlm reply that is received: -status - The status of the dlm request -flags - The dlm request flags -tdiff - The time taken by this specific request +====== ======================================= +status The status of the dlm request +flags The dlm request flags +tdiff The time taken by this specific request +====== ======================================= + (remaining fields as per above list) diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 17795341e0a36bc50f8a6956d1470e2b0d9fbd6f..4c536e66dc4c8558052a02102544451ede9f40e1 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -88,6 +88,7 @@ Documentation for filesystem implementations. f2fs gfs2 gfs2-uevents + gfs2-glocks hfs hfsplus hpfs diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 67657f216942dc2914744a3f10a1c50a87369532..0a23fe922f428756338e6381c21210860dd2b542 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -615,7 +615,7 @@ prototypes:: locking rules: ============= ======== =========================== -ops mmap_sem PageLocked(page) +ops mmap_lock PageLocked(page) ============= ======== =========================== open: yes close: yes diff --git a/Documentation/s390/index.rst b/Documentation/s390/index.rst index f7af2061e406573cfbe6296c12ec67e8052797ee..cf71df5776b4837d4bc143662956dec4d17ea424 100644 --- a/Documentation/s390/index.rst +++ b/Documentation/s390/index.rst @@ -15,6 +15,7 @@ s390 Architecture vfio-ccw zfcpdump common_io + pci text_files diff --git a/Documentation/s390/pci.rst b/Documentation/s390/pci.rst new file mode 100644 index 0000000000000000000000000000000000000000..492850bff3168bb5157ffe6158f90e2be2917815 --- /dev/null +++ b/Documentation/s390/pci.rst @@ -0,0 +1,125 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========= +S/390 PCI +========= + +Authors: + - Pierre Morel + +Copyright, IBM Corp. 2020 + + +Command line parameters and debugfs entries +=========================================== + +Command line parameters +----------------------- + +* nomio + + Do not use PCI Mapped I/O (MIO) instructions. + +* norid + + Ignore the RID field and force use of one PCI domain per PCI function. + +debugfs entries +--------------- + +The S/390 debug feature (s390dbf) generates views to hold various debug results in sysfs directories of the form: + + * /sys/kernel/debug/s390dbf/pci_*/ + +For example: + + - /sys/kernel/debug/s390dbf/pci_msg/sprintf + Holds messages from the processing of PCI events, like machine check handling + and setting of global functionality, like UID checking. + + Change the level of logging to be more or less verbose by piping + a number between 0 and 6 to /sys/kernel/debug/s390dbf/pci_*/level. For + details, see the documentation on the S/390 debug feature at + Documentation/s390/s390dbf.rst. + +Sysfs entries +============= + +Entries specific to zPCI functions and entries that hold zPCI information. + +* /sys/bus/pci/slots/XXXXXXXX + + The slot entries are set up using the function identifier (FID) of the + PCI function. + + - /sys/bus/pci/slots/XXXXXXXX/power + + A physical function that currently supports a virtual function cannot be + powered off until all virtual functions are removed with: + echo 0 > /sys/bus/pci/devices/XXXX:XX:XX.X/sriov_numvf + +* /sys/bus/pci/devices/XXXX:XX:XX.X/ + + - function_id + A zPCI function identifier that uniquely identifies the function in the Z server. + + - function_handle + Low-level identifier used for a configured PCI function. + It might be useful for debuging. + + - pchid + Model-dependent location of the I/O adapter. + + - pfgid + PCI function group ID, functions that share identical functionality + use a common identifier. + A PCI group defines interrupts, IOMMU, IOTLB, and DMA specifics. + + - vfn + The virtual function number, from 1 to N for virtual functions, + 0 for physical functions. + + - pft + The PCI function type + + - port + The port corresponds to the physical port the function is attached to. + It also gives an indication of the physical function a virtual function + is attached to. + + - uid + The unique identifier (UID) is defined when configuring an LPAR and is + unique in the LPAR. + + - pfip/segmentX + The segments determine the isolation of a function. + They correspond to the physical path to the function. + The more the segments are different, the more the functions are isolated. + +Enumeration and hotplug +======================= + +The PCI address consists of four parts: domain, bus, device and function, +and is of this form: DDDD:BB:dd.f + +* When not using multi-functions (norid is set, or the firmware does not + support multi-functions): + + - There is only one function per domain. + + - The domain is set from the zPCI function's UID as defined during the + LPAR creation. + +* When using multi-functions (norid parameter is not set), + zPCI functions are addressed differently: + + - There is still only one bus per domain. + + - There can be up to 256 functions per bus. + + - The domain part of the address of all functions for + a multi-Function device is set from the zPCI function's UID as defined + in the LPAR creation for the function zero. + + - New functions will only be ready for use after the function zero + (the function with devfn 0) has been enumerated. diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst index fca9c4f5bd9c0162778dff70b2514d4631768564..8aad08a8b8a50384dba9c500efc82e4b1679fdf6 100644 --- a/Documentation/s390/vfio-ccw.rst +++ b/Documentation/s390/vfio-ccw.rst @@ -204,15 +204,44 @@ definition of the region is:: __u32 ret_code; } __packed; +This region is always available. + While starting an I/O request, orb_area should be filled with the guest ORB, and scsw_area should be filled with the SCSW of the Virtual Subchannel. irb_area stores the I/O result. -ret_code stores a return code for each access of the region. +ret_code stores a return code for each access of the region. The following +values may occur: + +``0`` + The operation was successful. + +``-EOPNOTSUPP`` + The orb specified transport mode or an unidentified IDAW format, or the + scsw specified a function other than the start function. + +``-EIO`` + A request was issued while the device was not in a state ready to accept + requests, or an internal error occurred. + +``-EBUSY`` + The subchannel was status pending or busy, or a request is already active. + +``-EAGAIN`` + A request was being processed, and the caller should retry. + +``-EACCES`` + The channel path(s) used for the I/O were found to be not operational. + +``-ENODEV`` + The device was found to be not operational. + +``-EINVAL`` + The orb specified a chain longer than 255 ccws, or an internal error + occurred. -This region is always available. vfio-ccw cmd region ------------------- @@ -231,6 +260,64 @@ This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD. Currently, CLEAR SUBCHANNEL and HALT SUBCHANNEL use this region. +command specifies the command to be issued; ret_code stores a return code +for each access of the region. The following values may occur: + +``0`` + The operation was successful. + +``-ENODEV`` + The device was found to be not operational. + +``-EINVAL`` + A command other than halt or clear was specified. + +``-EIO`` + A request was issued while the device was not in a state ready to accept + requests. + +``-EAGAIN`` + A request was being processed, and the caller should retry. + +``-EBUSY`` + The subchannel was status pending or busy while processing a halt request. + +vfio-ccw schib region +--------------------- + +The vfio-ccw schib region is used to return Subchannel-Information +Block (SCHIB) data to userspace:: + + struct ccw_schib_region { + #define SCHIB_AREA_SIZE 52 + __u8 schib_area[SCHIB_AREA_SIZE]; + } __packed; + +This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_SCHIB. + +Reading this region triggers a STORE SUBCHANNEL to be issued to the +associated hardware. + +vfio-ccw crw region +--------------------- + +The vfio-ccw crw region is used to return Channel Report Word (CRW) +data to userspace:: + + struct ccw_crw_region { + __u32 crw; + __u32 pad; + } __packed; + +This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_CRW. + +Reading this region returns a CRW if one that is relevant for this +subchannel (e.g. one reporting changes in channel path state) is +pending, or all zeroes if not. If multiple CRWs are pending (including +possibly chained CRWs), reading this region again will return the next +one, until no more CRWs are pending and zeroes are returned. This is +similar to how STORE CHANNEL REPORT WORD works. + vfio-ccw operation details -------------------------- @@ -333,7 +420,14 @@ through DASD/ECKD device online in a guest now and use it as a block device. The current code allows the guest to start channel programs via -START SUBCHANNEL, and to issue HALT SUBCHANNEL and CLEAR SUBCHANNEL. +START SUBCHANNEL, and to issue HALT SUBCHANNEL, CLEAR SUBCHANNEL, +and STORE SUBCHANNEL. + +Currently all channel programs are prefetched, regardless of the +p-bit setting in the ORB. As a result, self modifying channel +programs are not supported. For this reason, IPL has to be handled as +a special case by a userspace/guest program; this has been implemented +in QEMU's s390-ccw bios as of QEMU 4.1. vfio-ccw supports classic (command mode) channel I/O only. Transport mode (HPF) is not supported. diff --git a/Documentation/s390/zfcpdump.rst b/Documentation/s390/zfcpdump.rst index 54e8e7caf7e76a25099f7db5d71b3d5955ec658d..a61de7aa8778f7aae9b5318b2c1d7a1e2c1239ff 100644 --- a/Documentation/s390/zfcpdump.rst +++ b/Documentation/s390/zfcpdump.rst @@ -46,5 +46,5 @@ initramfs with a user space application that writes the dump to a SCSI partition. For more information on how to use zfcpdump refer to the s390 'Using the Dump -Tools book', which is available from -http://www.ibm.com/developerworks/linux/linux390. +Tools' book, which is available from IBM Knowledge Center: +https://www.ibm.com/support/knowledgecenter/linuxonibm/liaaf/lnz_r_dt.html diff --git a/Documentation/trace/histogram-design.rst b/Documentation/trace/histogram-design.rst new file mode 100644 index 0000000000000000000000000000000000000000..eef840043da9c6a3df348932fbe19906a5e7ca7e --- /dev/null +++ b/Documentation/trace/histogram-design.rst @@ -0,0 +1,2115 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +Histogram Design Notes +====================== + +:Author: Tom Zanussi + +This document attempts to provide a description of how the ftrace +histograms work and how the individual pieces map to the data +structures used to implement them in trace_events_hist.c and +tracing_map.c. + +Note: All the ftrace histogram command examples assume the working + directory is the ftrace /tracing directory. For example:: + + # cd /sys/kernel/debug/tracing + +Also, the histogram output displayed for those commands will be +generally be truncated - only enough to make the point is displayed. + +'hist_debug' trace event files +============================== + +If the kernel is compiled with CONFIG_HIST_TRIGGERS_DEBUG set, an +event file named 'hist_debug' will appear in each event's +subdirectory. This file can be read at any time and will display some +of the hist trigger internals described in this document. Specific +examples and output will be described in test cases below. + +Basic histograms +================ + +First, basic histograms. Below is pretty much the simplest thing you +can do with histograms - create one with a single key on a single +event and cat the output:: + + # echo 'hist:keys=pid' >> events/sched/sched_waking/trigger + + # cat events/sched/sched_waking/hist + + { pid: 18249 } hitcount: 1 + { pid: 13399 } hitcount: 1 + { pid: 17973 } hitcount: 1 + { pid: 12572 } hitcount: 1 + ... + { pid: 10 } hitcount: 921 + { pid: 18255 } hitcount: 1444 + { pid: 25526 } hitcount: 2055 + { pid: 5257 } hitcount: 2055 + { pid: 27367 } hitcount: 2055 + { pid: 1728 } hitcount: 2161 + + Totals: + Hits: 21305 + Entries: 183 + Dropped: 0 + +What this does is create a histogram on the sched_waking event using +pid as a key and with a single value, hitcount, which even if not +explicitly specified, exists for every histogram regardless. + +The hitcount value is a per-bucket value that's automatically +incremented on every hit for the given key, which in this case is the +pid. + +So in this histogram, there's a separate bucket for each pid, and each +bucket contains a value for that bucket, counting the number of times +sched_waking was called for that pid. + +Each histogram is represented by a hist_data struct. + +To keep track of each key and value field in the histogram, hist_data +keeps an array of these fields named fields[]. The fields[] array is +an array containing struct hist_field representations of each +histogram val and key in the histogram (variables are also included +here, but are discussed later). So for the above histogram we have one +key and one value; in this case the one value is the hitcount value, +which all histograms have, regardless of whether they define that +value or not, which the above histogram does not. + +Each struct hist_field contains a pointer to the ftrace_event_field +from the event's trace_event_file along with various bits related to +that such as the size, offset, type, and a hist_field_fn_t function, +which is used to grab the field's data from the ftrace event buffer +(in most cases - some hist_fields such as hitcount don't directly map +to an event field in the trace buffer - in these cases the function +implementation gets its value from somewhere else). The flags field +indicates which type of field it is - key, value, variable, variable +reference, etc., with value being the default. + +The other important hist_data data structure in addition to the +fields[] array is the tracing_map instance created for the histogram, +which is held in the .map member. The tracing_map implements the +lock-free hash table used to implement histograms (see +kernel/trace/tracing_map.h for much more discussion about the +low-level data structures implementing the tracing_map). For the +purposes of this discussion, the tracing_map contains a number of +buckets, each bucket corresponding to a particular tracing_map_elt +object hashed by a given histogram key. + +Below is a diagram the first part of which describes the hist_data and +associated key and value fields for the histogram described above. As +you can see, there are two fields in the fields array, one val field +for the hitcount and one key field for the pid key. + +Below that is a diagram of a run-time snapshot of what the tracing_map +might look like for a given run. It attempts to show the +relationships between the hist_data fields and the tracing_map +elements for a couple hypothetical keys and values.:: + + +------------------+ + | hist_data | + +------------------+ +----------------+ + | .fields[] |---->| val = hitcount |----------------------------+ + +----------------+ +----------------+ | + | .map | | .size | | + +----------------+ +--------------+ | + | .offset | | + +--------------+ | + | .fn() | | + +--------------+ | + . | + . | + . | + +----------------+ <--- n_vals | + | key = pid |----------------------------|--+ + +----------------+ | | + | .size | | | + +--------------+ | | + | .offset | | | + +--------------+ | | + | .fn() | | | + +----------------+ <--- n_fields | | + | unused | | | + +----------------+ | | + | | | | + +--------------+ | | + | | | | + +--------------+ | | + | | | | + +--------------+ | | + n_keys = n_fields - n_vals | | + +The hist_data n_vals and n_fields delineate the extent of the fields[] | | +array and separate keys from values for the rest of the code. | | + +Below is a run-time representation of the tracing_map part of the | | +histogram, with pointers from various parts of the fields[] array | | +to corresponding parts of the tracing_map. | | + +The tracing_map consists of an array of tracing_map_entrys and a set | | +of preallocated tracing_map_elts (abbreviated below as map_entry and | | +map_elt). The total number of map_entrys in the hist_data.map array = | | +map->max_elts (actually map->map_size but only max_elts of those are | | +used. This is a property required by the map_insert() algorithm). | | + +If a map_entry is unused, meaning no key has yet hashed into it, its | | +.key value is 0 and its .val pointer is NULL. Once a map_entry has | | +been claimed, the .key value contains the key's hash value and the | | +.val member points to a map_elt containing the full key and an entry | | +for each key or value in the map_elt.fields[] array. There is an | | +entry in the map_elt.fields[] array corresponding to each hist_field | | +in the histogram, and this is where the continually aggregated sums | | +corresponding to each histogram value are kept. | | + +The diagram attempts to show the relationship between the | | +hist_data.fields[] and the map_elt.fields[] with the links drawn | | +between diagrams:: + + +-----------+ | | + | hist_data | | | + +-----------+ | | + | .fields | | | + +---------+ +-----------+ | | + | .map |---->| map_entry | | | + +---------+ +-----------+ | | + | .key |---> 0 | | + +---------+ | | + | .val |---> NULL | | + +-----------+ | | + | map_entry | | | + +-----------+ | | + | .key |---> pid = 999 | | + +---------+ +-----------+ | | + | .val |--->| map_elt | | | + +---------+ +-----------+ | | + . | .key |---> full key * | | + . +---------+ +---------------+ | | + . | .fields |--->| .sum (val) |<-+ | + +-----------+ +---------+ | 2345 | | | + | map_entry | +---------------+ | | + +-----------+ | .offset (key) |<----+ + | .key |---> 0 | 0 | | | + +---------+ +---------------+ | | + | .val |---> NULL . | | + +-----------+ . | | + | map_entry | . | | + +-----------+ +---------------+ | | + | .key | | .sum (val) or | | | + +---------+ +---------+ | .offset (key) | | | + | .val |--->| map_elt | +---------------+ | | + +-----------+ +---------+ | .sum (val) or | | | + | map_entry | | .offset (key) | | | + +-----------+ +---------------+ | | + | .key |---> pid = 4444 | | + +---------+ +-----------+ | | + | .val | | map_elt | | | + +---------+ +-----------+ | | + | .key |---> full key * | | + +---------+ +---------------+ | | + | .fields |--->| .sum (val) |<-+ | + +---------+ | 65523 | | + +---------------+ | + | .offset (key) |<----+ + | 0 | + +---------------+ + . + . + . + +---------------+ + | .sum (val) or | + | .offset (key) | + +---------------+ + | .sum (val) or | + | .offset (key) | + +---------------+ + +Abbreviations used in the diagrams:: + + hist_data = struct hist_trigger_data + hist_data.fields = struct hist_field + fn = hist_field_fn_t + map_entry = struct tracing_map_entry + map_elt = struct tracing_map_elt + map_elt.fields = struct tracing_map_field + +Whenever a new event occurs and it has a hist trigger associated with +it, event_hist_trigger() is called. event_hist_trigger() first deals +with the key: for each subkey in the key (in the above example, there +is just one subkey corresponding to pid), the hist_field that +represents that subkey is retrieved from hist_data.fields[] and the +hist_field_fn_t fn() associated with that field, along with the +field's size and offset, is used to grab that subkey's data from the +current trace record. + +Once the complete key has been retrieved, it's used to look that key +up in the tracing_map. If there's no tracing_map_elt associated with +that key, an empty one is claimed and inserted in the map for the new +key. In either case, the tracing_map_elt associated with that key is +returned. + +Once a tracing_map_elt available, hist_trigger_elt_update() is called. +As the name implies, this updates the element, which basically means +updating the element's fields. There's a tracing_map_field associated +with each key and value in the histogram, and each of these correspond +to the key and value hist_fields created when the histogram was +created. hist_trigger_elt_update() goes through each value hist_field +and, as for the keys, uses the hist_field's fn() and size and offset +to grab the field's value from the current trace record. Once it has +that value, it simply adds that value to that field's +continually-updated tracing_map_field.sum member. Some hist_field +fn()s, such as for the hitcount, don't actually grab anything from the +trace record (the hitcount fn() just increments the counter sum by 1), +but the idea is the same. + +Once all the values have been updated, hist_trigger_elt_update() is +done and returns. Note that there are also tracing_map_fields for +each subkey in the key, but hist_trigger_elt_update() doesn't look at +them or update anything - those exist only for sorting, which can +happen later. + +Basic histogram test +-------------------- + +This is a good example to try. It produces 3 value fields and 2 key +fields in the output:: + + # echo 'hist:keys=common_pid,call_site.sym:values=bytes_req,bytes_alloc,hitcount' >> events/kmem/kmalloc/trigger + +To see the debug data, cat the kmem/kmalloc's 'hist_debug' file. It +will show the trigger info of the histogram it corresponds to, along +with the address of the hist_data associated with the histogram, which +will become useful in later examples. It then displays the number of +total hist_fields associated with the histogram along with a count of +how many of those correspond to keys and how many correspond to values. + +It then goes on to display details for each field, including the +field's flags and the position of each field in the hist_data's +fields[] array, which is useful information for verifying that things +internally appear correct or not, and which again will become even +more useful in further examples:: + + # cat events/kmem/kmalloc/hist_debug + + # event histogram + # + # trigger info: hist:keys=common_pid,call_site.sym:vals=hitcount,bytes_req,bytes_alloc:sort=hitcount:size=2048 [active] + # + + hist_data: 000000005e48c9a5 + + n_vals: 3 + n_keys: 2 + n_fields: 5 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + VAL: normal u64 value + ftrace_event_field name: bytes_req + type: size_t + size: 8 + is_signed: 0 + + hist_data->fields[2]: + flags: + VAL: normal u64 value + ftrace_event_field name: bytes_alloc + type: size_t + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[3]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: common_pid + type: int + size: 8 + is_signed: 1 + + hist_data->fields[4]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: call_site + type: unsigned long + size: 8 + is_signed: 0 + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=common_pid,call_site.sym:values=bytes_req,bytes_alloc,hitcount' >> events/kmem/kmalloc/trigger + +Variables +========= + +Variables allow data from one hist trigger to be saved by one hist +trigger and retrieved by another hist trigger. For example, a trigger +on the sched_waking event can capture a timestamp for a particular +pid, and later a sched_switch event that switches to that pid event +can grab the timestamp and use it to calculate a time delta between +the two events:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> + events/sched/sched_waking/trigger + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> + events/sched/sched_switch/trigger + +In terms of the histogram data structures, variables are implemented +as another type of hist_field and for a given hist trigger are added +to the hist_data.fields[] array just after all the val fields. To +distinguish them from the existing key and val fields, they're given a +new flag type, HIST_FIELD_FL_VAR (abbreviated FL_VAR) and they also +make use of a new .var.idx field member in struct hist_field, which +maps them to an index in a new map_elt.vars[] array added to the +map_elt specifically designed to store and retrieve variable values. +The diagram below shows those new elements and adds a new variable +entry, ts0, corresponding to the ts0 variable in the sched_waking +trigger above. + +sched_waking histogram +----------------------:: + + +------------------+ + | hist_data |<-------------------------------------------------------+ + +------------------+ +-------------------+ | + | .fields[] |-->| val = hitcount | | + +----------------+ +-------------------+ | + | .map | | .size | | + +----------------+ +-----------------+ | + | .offset | | + +-----------------+ | + | .fn() | | + +-----------------+ | + | .flags | | + +-----------------+ | + | .var.idx | | + +-------------------+ | + | var = ts0 | | + +-------------------+ | + | .size | | + +-----------------+ | + | .offset | | + +-----------------+ | + | .fn() | | + +-----------------+ | + | .flags & FL_VAR | | + +-----------------+ | + | .var.idx |----------------------------+-+ | + +-----------------+ | | | + . | | | + . | | | + . | | | + +-------------------+ <--- n_vals | | | + | key = pid | | | | + +-------------------+ | | | + | .size | | | | + +-----------------+ | | | + | .offset | | | | + +-----------------+ | | | + | .fn() | | | | + +-----------------+ | | | + | .flags & FL_KEY | | | | + +-----------------+ | | | + | .var.idx | | | | + +-------------------+ <--- n_fields | | | + | unused | | | | + +-------------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + n_keys = n_fields - n_vals | | | + | | | + +This is very similar to the basic case. In the above diagram, we can | | | +see a new .flags member has been added to the struct hist_field | | | +struct, and a new entry added to hist_data.fields representing the ts0 | | | +variable. For a normal val hist_field, .flags is just 0 (modulo | | | +modifier flags), but if the value is defined as a variable, the .flags | | | +contains a set FL_VAR bit. | | | + +As you can see, the ts0 entry's .var.idx member contains the index | | | +into the tracing_map_elts' .vars[] array containing variable values. | | | +This idx is used whenever the value of the variable is set or read. | | | +The map_elt.vars idx assigned to the given variable is assigned and | | | +saved in .var.idx by create_tracing_map_fields() after it calls | | | +tracing_map_add_var(). | | | + +Below is a representation of the histogram at run-time, which | | | +populates the map, along with correspondence to the above hist_data and | | | +hist_field data structures. | | | + +The diagram attempts to show the relationship between the | | | +hist_data.fields[] and the map_elt.fields[] and map_elt.vars[] with | | | +the links drawn between diagrams. For each of the map_elts, you can | | | +see that the .fields[] members point to the .sum or .offset of a key | | | +or val and the .vars[] members point to the value of a variable. The | | | +arrows between the two diagrams show the linkages between those | | | +tracing_map members and the field definitions in the corresponding | | | +hist_data fields[] members.:: + + +-----------+ | | | + | hist_data | | | | + +-----------+ | | | + | .fields | | | | + +---------+ +-----------+ | | | + | .map |---->| map_entry | | | | + +---------+ +-----------+ | | | + | .key |---> 0 | | | + +---------+ | | | + | .val |---> NULL | | | + +-----------+ | | | + | map_entry | | | | + +-----------+ | | | + | .key |---> pid = 999 | | | + +---------+ +-----------+ | | | + | .val |--->| map_elt | | | | + +---------+ +-----------+ | | | + . | .key |---> full key * | | | + . +---------+ +---------------+ | | | + . | .fields |--->| .sum (val) | | | | + . +---------+ | 2345 | | | | + . +--| .vars | +---------------+ | | | + . | +---------+ | .offset (key) | | | | + . | | 0 | | | | + . | +---------------+ | | | + . | . | | | + . | . | | | + . | . | | | + . | +---------------+ | | | + . | | .sum (val) or | | | | + . | | .offset (key) | | | | + . | +---------------+ | | | + . | | .sum (val) or | | | | + . | | .offset (key) | | | | + . | +---------------+ | | | + . | | | | + . +---------------->+---------------+ | | | + . | ts0 |<--+ | | + . | 113345679876 | | | | + . +---------------+ | | | + . | unused | | | | + . | | | | | + . +---------------+ | | | + . . | | | + . . | | | + . . | | | + . +---------------+ | | | + . | unused | | | | + . | | | | | + . +---------------+ | | | + . | unused | | | | + . | | | | | + . +---------------+ | | | + . | | | + +-----------+ | | | + | map_entry | | | | + +-----------+ | | | + | .key |---> pid = 4444 | | | + +---------+ +-----------+ | | | + | .val |--->| map_elt | | | | + +---------+ +-----------+ | | | + . | .key |---> full key * | | | + . +---------+ +---------------+ | | | + . | .fields |--->| .sum (val) | | | | + +---------+ | 2345 | | | | + +--| .vars | +---------------+ | | | + | +---------+ | .offset (key) | | | | + | | 0 | | | | + | +---------------+ | | | + | . | | | + | . | | | + | . | | | + | +---------------+ | | | + | | .sum (val) or | | | | + | | .offset (key) | | | | + | +---------------+ | | | + | | .sum (val) or | | | | + | | .offset (key) | | | | + | +---------------+ | | | + | | | | + | +---------------+ | | | + +---------------->| ts0 |<--+ | | + | 213499240729 | | | + +---------------+ | | + | unused | | | + | | | | + +---------------+ | | + . | | + . | | + . | | + +---------------+ | | + | unused | | | + | | | | + +---------------+ | | + | unused | | | + | | | | + +---------------+ | | + +For each used map entry, there's a map_elt pointing to an array of | | +.vars containing the current value of the variables associated with | | +that histogram entry. So in the above, the timestamp associated with | | +pid 999 is 113345679876, and the timestamp variable in the same | | +.var.idx for pid 4444 is 213499240729. | | + +sched_switch histogram | | +---------------------- | | + +The sched_switch histogram paired with the above sched_waking | | +histogram is shown below. The most important aspect of the | | +sched_switch histogram is that it references a variable on the | | +sched_waking histogram above. | | + +The histogram diagram is very similar to the others so far displayed, | | +but it adds variable references. You can see the normal hitcount and | | +key fields along with a new wakeup_lat variable implemented in the | | +same way as the sched_waking ts0 variable, but in addition there's an | | +entry with the new FL_VAR_REF (short for HIST_FIELD_FL_VAR_REF) flag. | | + +Associated with the new var ref field are a couple of new hist_field | | +members, var.hist_data and var_ref_idx. For a variable reference, the | | +var.hist_data goes with the var.idx, which together uniquely identify | | +a particular variable on a particular histogram. The var_ref_idx is | | +just the index into the var_ref_vals[] array that caches the values of | | +each variable whenever a hist trigger is updated. Those resulting | | +values are then finally accessed by other code such as trace action | | +code that uses the var_ref_idx values to assign param values. | | + +The diagram below describes the situation for the sched_switch | | +histogram referred to before:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> | | + events/sched/sched_switch/trigger | | + | | + +------------------+ | | + | hist_data | | | + +------------------+ +-----------------------+ | | + | .fields[] |-->| val = hitcount | | | + +----------------+ +-----------------------+ | | + | .map | | .size | | | + +----------------+ +---------------------+ | | + +--| .var_refs[] | | .offset | | | + | +----------------+ +---------------------+ | | + | | .fn() | | | + | var_ref_vals[] +---------------------+ | | + | +-------------+ | .flags | | | + | | $ts0 |<---+ +---------------------+ | | + | +-------------+ | | .var.idx | | | + | | | | +---------------------+ | | + | +-------------+ | | .var.hist_data | | | + | | | | +---------------------+ | | + | +-------------+ | | .var_ref_idx | | | + | | | | +-----------------------+ | | + | +-------------+ | | var = wakeup_lat | | | + | . | +-----------------------+ | | + | . | | .size | | | + | . | +---------------------+ | | + | +-------------+ | | .offset | | | + | | | | +---------------------+ | | + | +-------------+ | | .fn() | | | + | | | | +---------------------+ | | + | +-------------+ | | .flags & FL_VAR | | | + | | +---------------------+ | | + | | | .var.idx | | | + | | +---------------------+ | | + | | | .var.hist_data | | | + | | +---------------------+ | | + | | | .var_ref_idx | | | + | | +---------------------+ | | + | | . | | + | | . | | + | | . | | + | | +-----------------------+ <--- n_vals | | + | | | key = pid | | | + | | +-----------------------+ | | + | | | .size | | | + | | +---------------------+ | | + | | | .offset | | | + | | +---------------------+ | | + | | | .fn() | | | + | | +---------------------+ | | + | | | .flags | | | + | | +---------------------+ | | + | | | .var.idx | | | + | | +-----------------------+ <--- n_fields | | + | | | unused | | | + | | +-----------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | n_keys = n_fields - n_vals | | + | | | | + | | | | + | | +-----------------------+ | | + +---------------------->| var_ref = $ts0 | | | + | +-----------------------+ | | + | | .size | | | + | +---------------------+ | | + | | .offset | | | + | +---------------------+ | | + | | .fn() | | | + | +---------------------+ | | + | | .flags & FL_VAR_REF | | | + | +---------------------+ | | + | | .var.idx |--------------------------+ | + | +---------------------+ | + | | .var.hist_data |----------------------------+ + | +---------------------+ + +---| .var_ref_idx | + +---------------------+ + +Abbreviations used in the diagrams:: + + hist_data = struct hist_trigger_data + hist_data.fields = struct hist_field + fn = hist_field_fn_t + FL_KEY = HIST_FIELD_FL_KEY + FL_VAR = HIST_FIELD_FL_VAR + FL_VAR_REF = HIST_FIELD_FL_VAR_REF + +When a hist trigger makes use of a variable, a new hist_field is +created with flag HIST_FIELD_FL_VAR_REF. For a VAR_REF field, the +var.idx and var.hist_data take the same values as the referenced +variable, as well as the referenced variable's size, type, and +is_signed values. The VAR_REF field's .name is set to the name of the +variable it references. If a variable reference was created using the +explicit system.event.$var_ref notation, the hist_field's system and +event_name variables are also set. + +So, in order to handle an event for the sched_switch histogram, +because we have a reference to a variable on another histogram, we +need to resolve all variable references first. This is done via the +resolve_var_refs() calls made from event_hist_trigger(). What this +does is grabs the var_refs[] array from the hist_data representing the +sched_switch histogram. For each one of those, the referenced +variable's var.hist_data along with the current key is used to look up +the corresponding tracing_map_elt in that histogram. Once found, the +referenced variable's var.idx is used to look up the variable's value +using tracing_map_read_var(elt, var.idx), which yields the value of +the variable for that element, ts0 in the case above. Note that both +the hist_fields representing both the variable and the variable +reference have the same var.idx, so this is straightforward. + +Variable and variable reference test +------------------------------------ + +This example creates a variable on the sched_waking event, ts0, and +uses it in the sched_switch trigger. The sched_switch trigger also +creates its own variable, wakeup_lat, but nothing yet uses it:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> events/sched/sched_switch/trigger + +Looking at the sched_waking 'hist_debug' output, in addition to the +normal key and value hist_fields, in the val fields section we see a +field with the HIST_FIELD_FL_VAR flag, which indicates that that field +represents a variable. Note that in addition to the variable name, +contained in the var.name field, it includes the var.idx, which is the +index into the tracing_map_elt.vars[] array of the actual variable +location. Note also that the output shows that variables live in the +same part of the hist_data->fields[] array as normal values:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 000000009536f554 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +Moving on to the sched_switch trigger hist_debug output, in addition +to the unused wakeup_lat variable, we see a new section displaying +variable references. Variable references are displayed in a separate +section because in addition to to being logically separate from +variables and values, they actually live in a separate hist_data +array, var_refs[]. + +In this example, the sched_switch trigger has a reference to a +variable on the sched_waking trigger, $ts0. Looking at the details, +we can see that the var.hist_data value of the referenced variable +matches the previously displayed sched_waking trigger, and the var.idx +value matches the previously displayed var.idx value for that +variable. Also displayed is the var_ref_idx value for that variable +reference, which is where the value for that variable is cached for +use when the trigger is invoked:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000f4ee8006 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 000000009536f554 + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +Actions and Handlers +==================== + +Adding onto the previous example, we will now do something with that +wakeup_lat variable, namely send it and another field as a synthetic +event. + +The onmatch() action below basically says that whenever we have a +sched_switch event, if we have a matching sched_waking event, in this +case if we have a pid in the sched_waking histogram that matches the +the next_pid field on this sched_switch event, we retrieve the +variables specified in the wakeup_latency() trace action, and use +them to generate a new wakeup_latency event into the trace stream. + +Note that the way the trace handlers such as wakeup_latency() (which +could equivalently be written trace(wakeup_latency,$wakeup_lat,next_pid) +are implemented, the parameters specified to the trace handler must be +variables. In this case, $wakeup_lat is obviously a variable, but +next_pid isn't, since it's just naming a field in the sched_switch +trace event. Since this is something that almost every trace() and +save() action does, a special shortcut is implemented to allow field +names to be used directly in those cases. How it works is that under +the covers, a temporary variable is created for the named field, and +this variable is what is actually passed to the trace handler. In the +code and documentation, this type of variable is called a 'field +variable'. + +Fields on other trace event's histograms can be used as well. In that +case we have to generate a new histogram and an unfortunately named +'synthetic_field' (the use of synthetic here has nothing to do with +synthetic events) and use that special histogram field as a variable. + +The diagram below illustrates the new elements described above in the +context of the sched_switch histogram using the onmatch() handler and +the trace() action. + +First, we define the wakeup_latency synthetic event:: + + # echo 'wakeup_latency u64 lat; pid_t pid' >> synthetic_events + +Next, the sched_waking hist trigger as before:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> + events/sched/sched_waking/trigger + +Finally, we create a hist trigger on the sched_switch event that +generates a wakeup_latency() trace event. In this case we pass +next_pid into the wakeup_latency synthetic event invocation, which +means it will be automatically converted into a field variable:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ + onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid)' >> + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +The diagram for the sched_switch event is similar to previous examples +but shows the additional field_vars[] array for hist_data and shows +the linkages between the field_vars and the variables and references +created to implement the field variables. The details are discussed +below:: + + +------------------+ + | hist_data | + +------------------+ +-----------------------+ + | .fields[] |-->| val = hitcount | + +----------------+ +-----------------------+ + | .map | | .size | + +----------------+ +---------------------+ + +---| .field_vars[] | | .offset | + | +----------------+ +---------------------+ + |+--| .var_refs[] | | .offset | + || +----------------+ +---------------------+ + || | .fn() | + || var_ref_vals[] +---------------------+ + || +-------------+ | .flags | + || | $ts0 |<---+ +---------------------+ + || +-------------+ | | .var.idx | + || | $next_pid |<-+ | +---------------------+ + || +-------------+ | | | .var.hist_data | + ||+>| $wakeup_lat | | | +---------------------+ + ||| +-------------+ | | | .var_ref_idx | + ||| | | | | +-----------------------+ + ||| +-------------+ | | | var = wakeup_lat | + ||| . | | +-----------------------+ + ||| . | | | .size | + ||| . | | +---------------------+ + ||| +-------------+ | | | .offset | + ||| | | | | +---------------------+ + ||| +-------------+ | | | .fn() | + ||| | | | | +---------------------+ + ||| +-------------+ | | | .flags & FL_VAR | + ||| | | +---------------------+ + ||| | | | .var.idx | + ||| | | +---------------------+ + ||| | | | .var.hist_data | + ||| | | +---------------------+ + ||| | | | .var_ref_idx | + ||| | | +---------------------+ + ||| | | . + ||| | | . + ||| | | . + ||| | | . + ||| +--------------+ | | . + +-->| field_var | | | . + || +--------------+ | | . + || | var | | | . + || +------------+ | | . + || | val | | | . + || +--------------+ | | . + || | field_var | | | . + || +--------------+ | | . + || | var | | | . + || +------------+ | | . + || | val | | | . + || +------------+ | | . + || . | | . + || . | | . + || . | | +-----------------------+ <--- n_vals + || +--------------+ | | | key = pid | + || | field_var | | | +-----------------------+ + || +--------------+ | | | .size | + || | var |--+| +---------------------+ + || +------------+ ||| | .offset | + || | val |-+|| +---------------------+ + || +------------+ ||| | .fn() | + || ||| +---------------------+ + || ||| | .flags | + || ||| +---------------------+ + || ||| | .var.idx | + || ||| +---------------------+ <--- n_fields + || ||| + || ||| n_keys = n_fields - n_vals + || ||| +-----------------------+ + || |+->| var = next_pid | + || | | +-----------------------+ + || | | | .size | + || | | +---------------------+ + || | | | .offset | + || | | +---------------------+ + || | | | .flags & FL_VAR | + || | | +---------------------+ + || | | | .var.idx | + || | | +---------------------+ + || | | | .var.hist_data | + || | | +-----------------------+ + || +-->| val for next_pid | + || | | +-----------------------+ + || | | | .size | + || | | +---------------------+ + || | | | .offset | + || | | +---------------------+ + || | | | .fn() | + || | | +---------------------+ + || | | | .flags | + || | | +---------------------+ + || | | | | + || | | +---------------------+ + || | | + || | | + || | | +-----------------------+ + +|------------------|-|>| var_ref = $ts0 | + | | | +-----------------------+ + | | | | .size | + | | | +---------------------+ + | | | | .offset | + | | | +---------------------+ + | | | | .fn() | + | | | +---------------------+ + | | | | .flags & FL_VAR_REF | + | | | +---------------------+ + | | +---| .var_ref_idx | + | | +-----------------------+ + | | | var_ref = $next_pid | + | | +-----------------------+ + | | | .size | + | | +---------------------+ + | | | .offset | + | | +---------------------+ + | | | .fn() | + | | +---------------------+ + | | | .flags & FL_VAR_REF | + | | +---------------------+ + | +-----| .var_ref_idx | + | +-----------------------+ + | | var_ref = $wakeup_lat | + | +-----------------------+ + | | .size | + | +---------------------+ + | | .offset | + | +---------------------+ + | | .fn() | + | +---------------------+ + | | .flags & FL_VAR_REF | + | +---------------------+ + +------------------------| .var_ref_idx | + +---------------------+ + +As you can see, for a field variable, two hist_fields are created: one +representing the variable, in this case next_pid, and one to actually +get the value of the field from the trace stream, like a normal val +field does. These are created separately from normal variable +creation and are saved in the hist_data->field_vars[] array. See +below for how these are used. In addition, a reference hist_field is +also created, which is needed to reference the field variables such as +$next_pid variable in the trace() action. + +Note that $wakeup_lat is also a variable reference, referencing the +value of the expression common_timestamp-$ts0, and so also needs to +have a hist field entry representing that reference created. + +When hist_trigger_elt_update() is called to get the normal key and +value fields, it also calls update_field_vars(), which goes through +each field_var created for the histogram, and available from +hist_data->field_vars and calls val->fn() to get the data from the +current trace record, and then uses the var's var.idx to set the +variable at the var.idx offset in the appropriate tracing_map_elt's +variable at elt->vars[var.idx]. + +Once all the variables have been updated, resolve_var_refs() can be +called from event_hist_trigger(), and not only can our $ts0 and +$next_pid references be resolved but the $wakeup_lat reference as +well. At this point, the trace() action can simply access the values +assembled in the var_ref_vals[] array and generate the trace event. + +The same process occurs for the field variables associated with the +save() action. + +Abbreviations used in the diagram:: + + hist_data = struct hist_trigger_data + hist_data.fields = struct hist_field + field_var = struct field_var + fn = hist_field_fn_t + FL_KEY = HIST_FIELD_FL_KEY + FL_VAR = HIST_FIELD_FL_VAR + FL_VAR_REF = HIST_FIELD_FL_VAR_REF + +trace() action field variable test +---------------------------------- + +This example adds to the previous test example by finally making use +of the wakeup_lat variable, but in addition also creates a couple of +field variables that then are all passed to the wakeup_latency() trace +action via the onmatch() handler. + +First, we create the wakeup_latency synthetic event:: + + # echo 'wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events + +Next, the sched_waking trigger from previous examples:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +Finally, as in the previous test example, we calculate and assign the +wakeup latency using the $ts0 reference from the sched_waking trigger +to the wakeup_lat variable, and finally use it along with a couple +sched_switch event fields, next_pid and next_comm, to generate a +wakeup_latency trace event. The next_pid and next_comm event fields +are automatically converted into field variables for this purpose:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,next_comm)' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +The sched_waking hist_debug output shows the same data as in the +previous test example:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000d60ff61f + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +The sched_switch hist_debug output shows the same key and value fields +as in the previous test example - note that wakeup_lat is still in the +val fields section, but that the new field variables are not there - +although the field variables are variables, they're held separately in +the hist_data's field_vars[] array. Although the field variables and +the normal variables are located in separate places, you can see that +the actual variable locations for those variables in the +tracing_map_elt.vars[] do have increasing indices as expected: +wakeup_lat takes the var.idx = 0 slot, while the field variables for +next_pid and next_comm have values var.idx = 1, and var.idx = 2. Note +also that those are the same values displayed for the variable +references corresponding to those variables in the variable reference +fields section. Since there are two triggers and thus two hist_data +addresses, those addresses also need to be accounted for when doing +the matching - you can see that the first variable refers to the 0 +var.idx on the previous hist trigger (see the hist_data address +associated with that trigger), while the second variable refers to the +0 var.idx on the sched_switch hist trigger, as do all the remaining +variable references. + +Finally, the action tracking variables section just shows the system +and event name for the onmatch() handler:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,next_comm) [active] + # + + hist_data: 0000000008f551b7 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000d60ff61f + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000008f551b7 + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + hist_data->var_refs[2]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 0000000008f551b7 + var_ref_idx (into hist_data->var_refs[]): 2 + type: pid_t + size: 4 + is_signed: 0 + + hist_data->var_refs[3]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + var.hist_data: 0000000008f551b7 + var_ref_idx (into hist_data->var_refs[]): 3 + type: char[16] + size: 256 + is_signed: 0 + + field variables: + + hist_data->field_vars[0]: + + field_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + + field_vars[0].val: + ftrace_event_field name: next_pid + type: pid_t + size: 4 + is_signed: 1 + + hist_data->field_vars[1]: + + field_vars[1].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + + field_vars[1].val: + ftrace_event_field name: next_comm + type: char[16] + size: 256 + is_signed: 0 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].match_data.event_system: sched + hist_data->actions[0].match_data.event: sched_waking + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,next_comm)' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo '!wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events + +action_data and the trace() action +---------------------------------- + +As mentioned above, when the trace() action generates a synthetic +event, all the parameters to the synthetic event either already are +variables or are converted into variables (via field variables), and +finally all those variable values are collected via references to them +into a var_ref_vals[] array. + +The values in the var_ref_vals[] array, however, don't necessarily +follow the same ordering as the synthetic event params. To address +that, struct action_data contains another array, var_ref_idx[] that +maps the trace action params to the var_ref_vals[] values. Below is a +diagram illustrating that for the wakeup_latency() synthetic event:: + + +------------------+ wakeup_latency() + | action_data | event params var_ref_vals[] + +------------------+ +-----------------+ +-----------------+ + | .var_ref_idx[] |--->| $wakeup_lat idx |---+ | | + +----------------+ +-----------------+ | +-----------------+ + | .synth_event | | $next_pid idx |---|-+ | $wakeup_lat val | + +----------------+ +-----------------+ | | +-----------------+ + . | +->| $next_pid val | + . | +-----------------+ + . | . + +-----------------+ | . + | | | . + +-----------------+ | +-----------------+ + +--->| $wakeup_lat val | + +-----------------+ + +Basically, how this ends up getting used in the synthetic event probe +function, trace_event_raw_event_synth(), is as follows:: + + for each field i in .synth_event + val_idx = .var_ref_idx[i] + val = var_ref_vals[val_idx] + +action_data and the onXXX() handlers +------------------------------------ + +The hist trigger onXXX() actions other than onmatch(), such as onmax() +and onchange(), also make use of and internally create hidden +variables. This information is contained in the +action_data.track_data struct, and is also visible in the hist_debug +output as will be described in the example below. + +Typically, the onmax() or onchange() handlers are used in conjunction +with the save() and snapshot() actions. For example:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ + onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm)' >> + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +or:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ + onmax($wakeup_lat).snapshot()' >> + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +save() action field variable test +--------------------------------- + +For this example, instead of generating a synthetic event, the save() +action is used to save field values whenever an onmax() handler +detects that a new max latency has been hit. As in the previous +example, the values being saved are also field values, but in this +case, are kept in a separate hist_data array named save_vars[]. + +As in previous test examples, we set up the sched_waking trigger:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +In this case, however, we set up the sched_switch trigger to save some +sched_switch field values whenever we hit a new maximum latency. For +both the onmax() handler and save() action, variables will be created, +which we can use the hist_debug files to examine:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm)' >> events/sched/sched_switch/trigger + +The sched_waking hist_debug output shows the same data as in the +previous test examples:: + + # cat events/sched/sched_waking/hist_debug + + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000e6290f48 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +The output of the sched_switch trigger shows the same val and key +values as before, but also shows a couple new sections. + +First, the action tracking variables section now shows the +actions[].track_data information describing the special tracking +variables and references used to track, in this case, the running +maximum value. The actions[].track_data.var_ref member contains the +reference to the variable being tracked, in this case the $wakeup_lat +variable. In order to perform the onmax() handler function, there +also needs to be a variable that tracks the current maximum by getting +updated whenever a new maximum is hit. In this case, we can see that +an auto-generated variable named ' __max' has been created and is +visible in the actions[].track_data.track_var variable. + +Finally, in the new 'save action variables' section, we can see that +the 4 params to the save() function have resulted in 4 field variables +being created for the purposes of saving the values of the named +fields when the max is hit. These variables are kept in a separate +save_vars[] array off of hist_data, so are displayed in a separate +section:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) [active] + # + + hist_data: 0000000057bcd28d + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000e6290f48 + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000057bcd28d + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].track_data.var_ref: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000057bcd28d + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + hist_data->actions[0].track_data.track_var: + flags: + HIST_FIELD_FL_VAR + var.name: __max + var.idx (into tracing_map_elt.vars[]): 1 + type: u64 + size: 8 + is_signed: 0 + + save action variables (save() params): + + hist_data->save_vars[0]: + + save_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + + save_vars[0].val: + ftrace_event_field name: next_comm + type: char[16] + size: 256 + is_signed: 0 + + hist_data->save_vars[1]: + + save_vars[1].var: + flags: + HIST_FIELD_FL_VAR + var.name: prev_pid + var.idx (into tracing_map_elt.vars[]): 3 + + save_vars[1].val: + ftrace_event_field name: prev_pid + type: pid_t + size: 4 + is_signed: 1 + + hist_data->save_vars[2]: + + save_vars[2].var: + flags: + HIST_FIELD_FL_VAR + var.name: prev_prio + var.idx (into tracing_map_elt.vars[]): 4 + + save_vars[2].val: + ftrace_event_field name: prev_prio + type: int + size: 4 + is_signed: 1 + + hist_data->save_vars[3]: + + save_vars[3].var: + flags: + HIST_FIELD_FL_VAR + var.name: prev_comm + var.idx (into tracing_map_elt.vars[]): 5 + + save_vars[3].val: + ftrace_event_field name: prev_comm + type: char[16] + size: 256 + is_signed: 0 + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm)' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +A couple special cases +====================== + +While the above covers the basics of the histogram internals, there +are a couple of special cases that should be discussed, since they +tend to create even more confusion. Those are field variables on other +histograms, and aliases, both described below through example tests +using the hist_debug files. + +Test of field variables on other histograms +------------------------------------------- + +This example is similar to the previous examples, but in this case, +the sched_switch trigger references a hist trigger field on another +event, namely the sched_waking event. In order to accomplish this, a +field variable is created for the other event, but since an existing +histogram can't be used, as existing histograms are immutable, a new +histogram with a matching variable is created and used, and we'll see +that reflected in the hist_debug output shown below. + +First, we create the wakeup_latency synthetic event. Note the +addition of the prio field:: + + # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> synthetic_events + +As in previous test examples, we set up the sched_waking trigger:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +Here we set up a hist trigger on sched_switch to send a wakeup_latency +event using an onmatch handler naming the sched_waking event. Note +that the third param being passed to the wakeup_latency() is prio, +which is a field name that needs to have a field variable created for +it. There isn't however any prio field on the sched_switch event so +it would seem that it wouldn't be possible to create a field variable +for it. The matching sched_waking event does have a prio field, so it +should be possible to make use of it for this purpose. The problem +with that is that it's not currently possible to define a new variable +on an existing histogram, so it's not possible to add a new prio field +variable to the existing sched_waking histogram. It is however +possible to create an additional new 'matching' sched_waking histogram +for the same event, meaning that it uses the same key and filters, and +define the new prio field variable on that. + +Here's the sched_switch trigger:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,prio)' >> events/sched/sched_switch/trigger + +And here's the output of the hist_debug information for the +sched_waking hist trigger. Note that there are two histograms +displayed in the output: the first is the normal sched_waking +histogram we've seen in the previous examples, and the second is the +special histogram we created to provide the prio field variable. + +Looking at the second histogram below, we see a variable with the name +synthetic_prio. This is the field variable created for the prio field +on that sched_waking histogram:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000349570e4 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:synthetic_prio=prio:sort=hitcount:size=2048 [active] + # + + hist_data: 000000006920cf38 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + ftrace_event_field name: prio + var.name: synthetic_prio + var.idx (into tracing_map_elt.vars[]): 0 + type: int + size: 4 + is_signed: 1 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +Looking at the sched_switch histogram below, we can see a reference to +the synthetic_prio variable on sched_waking, and looking at the +associated hist_data address we see that it is indeed associated with +the new histogram. Note also that the other references are to a +normal variable, wakeup_lat, and to a normal field variable, next_pid, +the details of which are in the field variables section:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,prio) [active] + # + + hist_data: 00000000a73b67df + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000349570e4 + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000a73b67df + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + hist_data->var_refs[2]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 00000000a73b67df + var_ref_idx (into hist_data->var_refs[]): 2 + type: pid_t + size: 4 + is_signed: 0 + + hist_data->var_refs[3]: + flags: + HIST_FIELD_FL_VAR_REF + name: synthetic_prio + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 000000006920cf38 + var_ref_idx (into hist_data->var_refs[]): 3 + type: int + size: 4 + is_signed: 1 + + field variables: + + hist_data->field_vars[0]: + + field_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + + field_vars[0].val: + ftrace_event_field name: next_pid + type: pid_t + size: 4 + is_signed: 1 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].match_data.event_system: sched + hist_data->actions[0].match_data.event: sched_waking + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,prio)' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo '!wakeup_latency u64 lat; pid_t pid; int prio' >> synthetic_events + +Alias test +---------- + +This example is very similar to previous examples, but demonstrates +the alias flag. + +First, we create the wakeup_latency synthetic event:: + + # echo 'wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events + +Next, we create a sched_waking trigger similar to previous examples, +but in this case we save the pid in the waking_pid variable:: + + # echo 'hist:keys=pid:waking_pid=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +For the sched_switch trigger, instead of using $waking_pid directly in +the wakeup_latency synthetic event invocation, we create an alias of +$waking_pid named $woken_pid, and use that in the synthetic event +invocation instead:: + + # echo 'hist:keys=next_pid:woken_pid=$waking_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,$woken_pid,next_comm)' >> events/sched/sched_switch/trigger + +Looking at the sched_waking hist_debug output, in addition to the +normal fields, we can see the waking_pid variable:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:waking_pid=pid,ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000a250528c + + n_vals: 3 + n_keys: 1 + n_fields: 4 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + ftrace_event_field name: pid + var.name: waking_pid + var.idx (into tracing_map_elt.vars[]): 0 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 1 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[3]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +The sched_switch hist_debug output shows that a variable named +woken_pid has been created but that it also has the +HIST_FIELD_FL_ALIAS flag set. It also has the HIST_FIELD_FL_VAR flag +set, which is why it appears in the val field section. + +Despite that implementation detail, an alias variable is actually more +like a variable reference; in fact it can be thought of as a reference +to a reference. The implementation copies the var_ref->fn() from the +variable reference being referenced, in this case, the waking_pid +fn(), which is hist_field_var_ref() and makes that the fn() of the +alias. The hist_field_var_ref() fn() requires the var_ref_idx of the +variable reference it's using, so waking_pid's var_ref_idx is also +copied to the alias. The end result is that when the value of alias +is retrieved, in the end it just does the same thing the original +reference would have done and retrieves the same value from the +var_ref_vals[] array. You can verify this in the output by noting +that the var_ref_idx of the alias, in this case woken_pid, is the same +as the var_ref_idx of the reference, waking_pid, in the variable +reference fields section. + +Additionally, once it gets that value, since it is also a variable, it +then saves that value into its var.idx. So the var.idx of the +woken_pid alias is 0, which it fills with the value from var_ref_idx 0 +when its fn() is called to update itself. You'll also notice that +there's a woken_pid var_ref in the variable refs section. That is the +reference to the woken_pid alias variable, and you can see that it +retrieves the value from the same var.idx as the woken_pid alias, 0, +and then in turn saves that value in its own var_ref_idx slot, 3, and +the value at this position is finally what gets assigned to the +$woken_pid slot in the trace event invocation:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:woken_pid=$waking_pid,wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,$woken_pid,next_comm) [active] + # + + hist_data: 0000000055d65ed0 + + n_vals: 3 + n_keys: 1 + n_fields: 4 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + HIST_FIELD_FL_ALIAS + var.name: woken_pid + var.idx (into tracing_map_elt.vars[]): 0 + var_ref_idx (into hist_data->var_refs[]): 0 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 1 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[3]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: waking_pid + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000a250528c + var_ref_idx (into hist_data->var_refs[]): 0 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 00000000a250528c + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[2]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 0000000055d65ed0 + var_ref_idx (into hist_data->var_refs[]): 2 + type: u64 + size: 0 + is_signed: 0 + + hist_data->var_refs[3]: + flags: + HIST_FIELD_FL_VAR_REF + name: woken_pid + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000055d65ed0 + var_ref_idx (into hist_data->var_refs[]): 3 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->var_refs[4]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + var.hist_data: 0000000055d65ed0 + var_ref_idx (into hist_data->var_refs[]): 4 + type: char[16] + size: 256 + is_signed: 0 + + field variables: + + hist_data->field_vars[0]: + + field_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + + field_vars[0].val: + ftrace_event_field name: next_comm + type: char[16] + size: 256 + is_signed: 0 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].match_data.event_system: sched + hist_data->actions[0].match_data.event: sched_waking + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:woken_pid=$waking_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,$woken_pid,next_comm)' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo '!wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 561969754bc08b6d33009c996db923dd7d1edac0..6f9e000757fa07e4f0647259f88e81b71afe8cdf 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -191,15 +191,15 @@ The usage pattern is:: again: range.notifier_seq = mmu_interval_read_begin(&interval_sub); - down_read(&mm->mmap_sem); + mmap_read_lock(mm); ret = hmm_range_fault(&range); if (ret) { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (ret == -EBUSY) goto again; return ret; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); take_lock(driver->update); if (mmu_interval_read_retry(&ni, range.notifier_seq) { diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst index 37c57ca326297a500a507e036bde0aeeaa47c0d2..0ed23e59abe5130865d86889f2cbdcd681cb4daf 100644 --- a/Documentation/vm/transhuge.rst +++ b/Documentation/vm/transhuge.rst @@ -98,9 +98,9 @@ split_huge_page() or split_huge_pmd() has a cost. To make pagetable walks huge pmd aware, all you need to do is to call pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the -mmap_sem in read (or write) mode to be sure a huge pmd cannot be +mmap_lock in read (or write) mode to be sure a huge pmd cannot be created from under you by khugepaged (khugepaged collapse_huge_page -takes the mmap_sem in write mode in addition to the anon_vma lock). If +takes the mmap_lock in write mode in addition to the anon_vma lock). If pmd_trans_huge returns false, you just fallback in the old code paths. If instead pmd_trans_huge returns true, you have to take the page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the diff --git a/MAINTAINERS b/MAINTAINERS index 5e7c42fecb2a1ebdea946b7a12727de65e0277db..456162183074b2dd90787608b0eccf9d8c5e5858 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7251,7 +7251,7 @@ L: cluster-devel@redhat.com S: Supported W: http://sources.redhat.com/cluster/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git -F: Documentation/filesystems/gfs2*.txt +F: Documentation/filesystems/gfs2* F: fs/gfs2/ F: include/uapi/linux/gfs2_ondisk.h @@ -8502,6 +8502,7 @@ F: drivers/mtd/nand/raw/ingenic/ F: drivers/pinctrl/pinctrl-ingenic.c F: drivers/power/supply/ingenic-battery.c F: drivers/pwm/pwm-jz4740.c +F: drivers/remoteproc/ingenic_rproc.c F: drivers/rtc/rtc-jz4740.c F: drivers/tty/serial/8250/8250_ingenic.c F: drivers/usb/musb/jz4740.c @@ -14840,6 +14841,7 @@ S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ F: arch/s390/pci/ F: drivers/pci/hotplug/s390_pci_hpc.c +F: Documentation/s390/pci.rst S390 VFIO AP DRIVER M: Tony Krowiak diff --git a/arch/alpha/boot/bootp.c b/arch/alpha/boot/bootp.c index 95c0359f485895676cba97b1f8defc15b0970852..00266e6e1b71486ace6707fdb0c416e6a323fab9 100644 --- a/arch/alpha/boot/bootp.c +++ b/arch/alpha/boot/bootp.c @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/bootpz.c b/arch/alpha/boot/bootpz.c index 99b8d7dc344bffc5e879c46eeeebc857c605594e..43af71835adf8aa5494614186193b6d9ee1e6d77 100644 --- a/arch/alpha/boot/bootpz.c +++ b/arch/alpha/boot/bootpz.c @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c index 8f5ed861097037bbf3d01e6b5672c68c02cd4b3e..e5347a08000867816c67ec542a0e31a5c4de15d9 100644 --- a/arch/alpha/boot/main.c +++ b/arch/alpha/boot/main.c @@ -14,7 +14,6 @@ #include #include -#include #include diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index 89128489cb598b27aa97c90f71da6564a96ce2c9..9945ff483eaf7e469638a9e1477960230977d8e9 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -4,19 +4,6 @@ #include -/* Caches aren't brain-dead on the Alpha. */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - /* Note that the following two definitions are _highly_ dependent on the contexts in which they are used in the kernel. I personally think it is criminal how loosely defined these macros are. */ @@ -48,7 +35,7 @@ extern void smp_imb(void); extern void __load_new_mm_context(struct mm_struct *); static inline void -flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { if (vma->vm_flags & VM_EXEC) { @@ -59,20 +46,17 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page, mm->context[smp_processor_id()] = 0; } } -#else -extern void flush_icache_user_range(struct vm_area_struct *vma, +#define flush_icache_user_page flush_icache_user_page +#else /* CONFIG_SMP */ +extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#endif +#define flush_icache_user_page flush_icache_user_page +#endif /* CONFIG_SMP */ /* This is used only in __do_fault and do_swap_page. */ #define flush_icache_page(vma, page) \ - flush_icache_user_range((vma), (page), 0, 0) + flush_icache_user_page((vma), (page), 0, 0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ -do { memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ -} while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include #endif /* _ALPHA_CACHEFLUSH_H */ diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index d1ed5a8133c5df21e357bd4aae2ecc9f53b8f1a1..13bea465f1c063d104ac52b436dd149a3d043fd9 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 0267aa8a4f8647351eebdde01c0875f2c58cf00c..162c17b2631fb00f26590667d5eb9594996966d7 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -276,15 +276,6 @@ extern inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) &= ~_PAGE_FOW; return extern inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= __DIRTY_BITS; return pte; } extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= __ACCESS_BITS; return pte; } -#define PAGE_DIR_OFFSET(tsk,address) pgd_offset((tsk),(address)) - -/* to find an entry in a kernel page-table-directory */ -#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) - -/* to find an entry in a page-table-directory. */ -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) -#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) - /* * The smp_read_barrier_depends() in the following functions are required to * order the load of *dir (the pointer in the top level page table) with any @@ -305,6 +296,7 @@ extern inline pmd_t * pmd_offset(pud_t * dir, unsigned long address) smp_read_barrier_depends(); /* see above */ return ret; } +#define pmd_offset pmd_offset /* Find an entry in the third-level page table.. */ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address) @@ -314,9 +306,7 @@ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address) smp_read_barrier_depends(); /* see above */ return ret; } - -#define pte_offset_map(dir,addr) pte_offset_kernel((dir),(addr)) -#define pte_unmap(pte) do { } while (0) +#define pte_offset_kernel pte_offset_kernel extern pgd_t swapper_pg_dir[1024]; @@ -355,8 +345,6 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) extern void paging_init(void); -#include - /* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 48b81d015d8a86107eec61c2f1816f5cc65539b4..b45f0b0d6511bfccccd8515eb531089605fde5c1 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/proto.h b/arch/alpha/kernel/proto.h index f1fce942fddcf3ceec4a7616c0f98912934afd3c..701a05090141700fd8f29c2fff79db7968ea67e0 100644 --- a/arch/alpha/kernel/proto.h +++ b/arch/alpha/kernel/proto.h @@ -2,8 +2,6 @@ #include #include -#include - /* Prototypes of functions used across modules here in this directory. */ #define vucp volatile unsigned char * diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index cb8d599e72d661a080e8012e4d3ba499b349650c..8c43212ae38e6bb8bbf9b2386e1e0089b248b6ba 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -19,7 +19,6 @@ #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 6fa802c495b464a820f2392851c13a70f8763b29..f5c42a8fcf9c83a7a00c07516a6f0c96f0269f66 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -55,7 +55,6 @@ static struct notifier_block alpha_panic_block = { }; #include -#include #include #include #include diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 5f90df30be20a699d2ffde53234c108c55eb81e9..631cc17410d1508cfdf0a0dc3dcc34c6adac5483 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -36,7 +36,6 @@ #include #include -#include #include #include #include @@ -740,7 +739,7 @@ ipi_flush_icache_page(void *x) } void -flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { struct mm_struct *mm = vma->vm_mm; diff --git a/arch/alpha/kernel/sys_alcor.c b/arch/alpha/kernel/sys_alcor.c index ce5430056f65f9d1fec882f2cc413c743cc72790..e063b3857b3d56cccbd5ab2f5f9084c7f5cdd40f 100644 --- a/arch/alpha/kernel/sys_alcor.c +++ b/arch/alpha/kernel/sys_alcor.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_cabriolet.c b/arch/alpha/kernel/sys_cabriolet.c index 0aa6a27d0e2f8ca5da9b15bb04a7b8e732ee5978..47459b73cdb7e2ee0afd551cfd79ca3b80f5a265 100644 --- a/arch/alpha/kernel/sys_cabriolet.c +++ b/arch/alpha/kernel/sys_cabriolet.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index d335086218207d9327df05499ded58ea2b4deeff..9fb445d7dca52c8f7591cf62605b0ac7713d7f6b 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eb64p.c b/arch/alpha/kernel/sys_eb64p.c index 1cdfe55fb98735d5b18b57614027a5c9d9e44dbc..3c43fd34752660739b5e003eb076244c38599bdd 100644 --- a/arch/alpha/kernel/sys_eb64p.c +++ b/arch/alpha/kernel/sys_eb64p.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c index 016f79251141a547f47f1c98c0bfc13c1ca98851..bf99dcfd40c4e3932467c83f1bd0163c93c9fcec 100644 --- a/arch/alpha/kernel/sys_eiger.c +++ b/arch/alpha/kernel/sys_eiger.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index d0d44f543d77c9d715dfcbc5ed5913ac6dded773..0a2ab6cb18db49c9e89c640b4a13ab1956ae868f 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c index 533899a4a1a1daf21329b8963b2e6977759d8a25..83d6c53d6d4d14c545e3ea346eaa3d4a8f7afc12 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_miata.c b/arch/alpha/kernel/sys_miata.c index 702292af2225d571952cb30d09d66cb908bad37f..e1bee8f84c5877ce2be3bdd0470963b23d21bd9d 100644 --- a/arch/alpha/kernel/sys_miata.c +++ b/arch/alpha/kernel/sys_miata.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_mikasa.c b/arch/alpha/kernel/sys_mikasa.c index 3af4f94113e1701e6729b2932a8055b3c6902201..7690dfd57cb6bc5aaf8c3bcc92e8a033c32b817a 100644 --- a/arch/alpha/kernel/sys_mikasa.c +++ b/arch/alpha/kernel/sys_mikasa.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c index 32850e45834b0c1959f20f21f07e2d0db2757679..53adf43dcd44fca8f6cd5db06932a5e5e8612f0c 100644 --- a/arch/alpha/kernel/sys_nautilus.c +++ b/arch/alpha/kernel/sys_nautilus.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_noritake.c b/arch/alpha/kernel/sys_noritake.c index b106f327f765268541a6f4f29033eda9f3898328..47f3ce4f719adabeef155daaa8d0e8a7e6cd1d0a 100644 --- a/arch/alpha/kernel/sys_noritake.c +++ b/arch/alpha/kernel/sys_noritake.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_rawhide.c b/arch/alpha/kernel/sys_rawhide.c index b76f65d0e8b54f51e3874116ebd37e4223c33bb5..b5846ffdadce5b3fa713c7522e394f3da106955e 100644 --- a/arch/alpha/kernel/sys_rawhide.c +++ b/arch/alpha/kernel/sys_rawhide.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_ruffian.c b/arch/alpha/kernel/sys_ruffian.c index d3307401196029b012959884b7b4e295911ef641..4b1c8d85c4f0839a9c0fc53a3f3d3b60169eecc4 100644 --- a/arch/alpha/kernel/sys_ruffian.c +++ b/arch/alpha/kernel/sys_ruffian.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_rx164.c b/arch/alpha/kernel/sys_rx164.c index 4d85eaeb44aa49dfd104d69860df21777d47d520..94046f9aea08d4f6c2bb1832e6119caa2831ce9c 100644 --- a/arch/alpha/kernel/sys_rx164.c +++ b/arch/alpha/kernel/sys_rx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c index 3cf0d32da5d80adf41c4357a927bda5617e933c1..930005b2f630dd531d6d32d28101e9ae9bd4e9e4 100644 --- a/arch/alpha/kernel/sys_sable.c +++ b/arch/alpha/kernel/sys_sable.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_sio.c b/arch/alpha/kernel/sys_sio.c index a6bdc1da47adb7ed293c365b7ec59068f4098a69..7c420d8dac53d9da2d2ccc38ec09878a6472df67 100644 --- a/arch/alpha/kernel/sys_sio.c +++ b/arch/alpha/kernel/sys_sio.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_sx164.c b/arch/alpha/kernel/sys_sx164.c index 17cc203176c803d22209a39f244bc8967d954539..dd9de84b630c35556bb407082c4d953867ae6268 100644 --- a/arch/alpha/kernel/sys_sx164.c +++ b/arch/alpha/kernel/sys_sx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_takara.c b/arch/alpha/kernel/sys_takara.c index e230c68640883e3733059a99e3dfc7c83140677a..9e2adb69bc74e8c7a9295c89f91a6b1ee67c1f66 100644 --- a/arch/alpha/kernel/sys_takara.c +++ b/arch/alpha/kernel/sys_takara.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index c8390d8de14003849665a154bf251d590b210bda..b1f3b4fcf99b2c11e46984eb353f0a7a9696809c 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_wildfire.c b/arch/alpha/kernel/sys_wildfire.c index 2191bde161fdef2d1a017edfcac7650a6c61a467..2c54d707142ab746ad00bd0f80863e9c9a4f56b4 100644 --- a/arch/alpha/kernel/sys_wildfire.c +++ b/arch/alpha/kernel/sys_wildfire.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index f6b9664ac5042d7eca61c9889c321648cf532761..49754e07e04f47e596ad0662b484283b974cb786 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -121,10 +121,10 @@ dik_show_code(unsigned int *pc) } static void -dik_show_trace(unsigned long *sp) +dik_show_trace(unsigned long *sp, const char *loglvl) { long i = 0; - printk("Trace:\n"); + printk("%sTrace:\n", loglvl); while (0x1ff8 & (unsigned long) sp) { extern char _stext[], _etext[]; unsigned long tmp = *sp; @@ -133,24 +133,24 @@ dik_show_trace(unsigned long *sp) continue; if (tmp >= (unsigned long) &_etext) continue; - printk("[<%lx>] %pSR\n", tmp, (void *)tmp); + printk("%s[<%lx>] %pSR\n", loglvl, tmp, (void *)tmp); if (i > 40) { - printk(" ..."); + printk("%s ...", loglvl); break; } } - printk("\n"); + printk("%s\n", loglvl); } static int kstack_depth_to_print = 24; -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { unsigned long *stack; int i; /* - * debugging aid: "show_stack(NULL);" prints the + * debugging aid: "show_stack(NULL, NULL, KERN_EMERG);" prints the * back trace for this cpu. */ if(sp==NULL) @@ -163,14 +163,14 @@ void show_stack(struct task_struct *task, unsigned long *sp) if ((i % 4) == 0) { if (i) pr_cont("\n"); - printk(" "); + printk("%s ", loglvl); } else { pr_cont(" "); } pr_cont("%016lx", *stack++); } pr_cont("\n"); - dik_show_trace(sp); + dik_show_trace(sp, loglvl); } void @@ -184,7 +184,7 @@ die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) printk("%s(%d): %s %ld\n", current->comm, task_pid_nr(current), str, err); dik_show_regs(regs, r9_15); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); - dik_show_trace((unsigned long *)(regs+1)); + dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); dik_show_code((unsigned int *)regs->pc); if (test_and_set_thread_flag (TIF_DIE_IF_KERNEL)) { @@ -625,7 +625,7 @@ got_exception: printk("gp = %016lx sp = %p\n", regs->gp, regs+1); dik_show_code((unsigned int *)pc); - dik_show_trace((unsigned long *)(regs+1)); + dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); if (test_and_set_thread_flag (TIF_DIE_IF_KERNEL)) { printk("die_if_kernel recursion detected.\n"); @@ -957,12 +957,12 @@ give_sigsegv: si_code = SEGV_ACCERR; else { struct mm_struct *mm = current->mm; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); if (find_vma(mm, (unsigned long)va)) si_code = SEGV_ACCERR; else si_code = SEGV_MAPERR; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } send_sig_fault(SIGSEGV, si_code, va, 0, current); return; diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index c2d7b6d7bac75b1705fbadd2b59dc3b0989ed8ee..c2303a8c2b9f7ca40cdb7d01e03fb0a7bde8a2a9 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -117,7 +117,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, if (user_mode(regs)) flags |= FAULT_FLAG_USER; retry: - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, address); if (!vma) goto bad_area; @@ -171,7 +171,7 @@ retry: if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; - /* No need to up_read(&mm->mmap_sem) as we would + /* No need to mmap_read_unlock(mm) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. */ @@ -180,14 +180,14 @@ retry: } } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return; /* Something tried to access memory that isn't in our memory map. Fix it, but check if it's kernel or user first. */ bad_area: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (user_mode(regs)) goto do_sigsegv; @@ -211,14 +211,14 @@ retry: /* We ran out of memory, or some other thing happened to us that made us unable to handle the page fault gracefully. */ out_of_memory: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (!user_mode(regs)) goto no_context; pagefault_out_of_memory(); return; do_sigbus: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* Send a sigbus, regardless of whether we were in kernel or user mode. */ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address, 0); diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 667cd21393b5b4603698bd987b654f203e4a1f97..3c42b3147fd6f00168609b243805f87c5099e921 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/arch/arc/include/asm/bug.h b/arch/arc/include/asm/bug.h index 0be19fd1a4126ae42bcbd3dc4cd32a2f4e292748..4c453ba96c51956a0314eaa87d0a92959c725e9a 100644 --- a/arch/arc/include/asm/bug.h +++ b/arch/arc/include/asm/bug.h @@ -13,7 +13,8 @@ struct task_struct; void show_regs(struct pt_regs *regs); -void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs); +void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs, + const char *loglvl); void show_kernel_fault_diag(const char *str, struct pt_regs *regs, unsigned long address); void die(const char *str, struct pt_regs *regs, unsigned long address); diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 12be7e1b7cc07520b2e9d8a6d78196c6aa6b4277..f1ed17edb085b901f3fcd0980a302dd971c14090 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -248,9 +248,6 @@ extern char empty_zero_page[PAGE_SIZE]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) - #define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval)) #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) @@ -282,18 +279,6 @@ static inline void pmd_set(pmd_t *pmdp, pte_t *ptep) /* Don't use virt_to_pfn for macros below: could cause truncations for PAE40*/ #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) -#define __pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) - -/* - * pte_offset gets a @ptr to PMD entry (PGD in our 2-tier paging system) - * and returns ptr to PTE entry corresponding to @addr - */ -#define pte_offset(dir, addr) ((pte_t *)(pmd_page_vaddr(*dir)) +\ - __pte_index(addr)) - -/* No mapping of Page Tables in high mem etc, so following same as above */ -#define pte_offset_kernel(dir, addr) pte_offset(dir, addr) -#define pte_offset_map(dir, addr) pte_offset(dir, addr) /* Zoo of pte_xxx function */ #define pte_read(pte) (pte_val(pte) & _PAGE_READ) @@ -331,13 +316,6 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, pteval); } -/* - * All kernel related VM pages are in init's mm. - */ -#define pgd_offset_k(address) pgd_offset(&init_mm, address) -#define pgd_index(addr) ((addr) >> PGDIR_SHIFT) -#define pgd_offset(mm, addr) (((mm)->pgd)+pgd_index(addr)) - /* * Macro to quickly access the PGD entry, utlising the fact that some * arch may cache the pointer to Page Directory of "current" task @@ -390,8 +368,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, #include #endif -#include - /* to cope with aliasing VIPT cache */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 315528f04bc128ed184f22af58a093a3ad68f053..8c8e5172fecd51ffe1b627c74289b91277dfb122 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -90,10 +90,10 @@ fault: if (unlikely(ret != -EFAULT)) goto fail; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); ret = fixup_user_fault(current, current->mm, (unsigned long) uaddr, FAULT_FLAG_WRITE, NULL); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (likely(!ret)) goto again; diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 1e440bbfa876ba2b26b667a29d4a8bdc577f9dcc..feba91c9d969cae7a86afb4daba599c26a098290 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -158,9 +158,11 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, /* Call-back which plugs into unwinding core to dump the stack in * case of panic/OOPs/BUG etc */ -static int __print_sym(unsigned int address, void *unused) +static int __print_sym(unsigned int address, void *arg) { - printk(" %pS\n", (void *)address); + const char *loglvl = arg; + + printk("%s %pS\n", loglvl, (void *)address); return 0; } @@ -217,17 +219,18 @@ static int __get_first_nonsched(unsigned int address, void *unused) *------------------------------------------------------------------------- */ -noinline void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs) +noinline void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs, + const char *loglvl) { - pr_info("\nStack Trace:\n"); - arc_unwind_core(tsk, regs, __print_sym, NULL); + printk("%s\nStack Trace:\n", loglvl); + arc_unwind_core(tsk, regs, __print_sym, (void *)loglvl); } EXPORT_SYMBOL(show_stacktrace); /* Expected by sched Code */ -void show_stack(struct task_struct *tsk, unsigned long *sp) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { - show_stacktrace(tsk, NULL); + show_stacktrace(tsk, NULL, loglvl); } /* Another API expected by schedular, shows up in "ps" as Wait Channel diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 3393558876a9b154ef723b8c0e025ca1f96ab633..28e8bf04b253f47be704cc375907d270536ecb94 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -89,7 +89,7 @@ static void show_faulting_vma(unsigned long address) /* can't use print_vma_addr() yet as it doesn't check for * non-inclusive vma */ - down_read(&active_mm->mmap_sem); + mmap_read_lock(active_mm); vma = find_vma(active_mm, address); /* check against the find_vma( ) behaviour which returns the next VMA @@ -111,7 +111,7 @@ static void show_faulting_vma(unsigned long address) } else pr_info(" @No matching VMA found\n"); - up_read(&active_mm->mmap_sem); + mmap_read_unlock(active_mm); } static void show_ecr_verbose(struct pt_regs *regs) @@ -240,5 +240,5 @@ void show_kernel_fault_diag(const char *str, struct pt_regs *regs, /* Show stack trace if this Fatality happened in kernel mode */ if (!user_mode(regs)) - show_stacktrace(current, regs); + show_stacktrace(current, regs, KERN_DEFAULT); } diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 92b339c7adbaad4b5faf0cbac91a4746fdf5009e..72f5405a7ec59de4655c2edd0036c210ed19a92c 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -107,7 +107,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) flags |= FAULT_FLAG_WRITE; retry: - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, address); if (!vma) @@ -141,7 +141,7 @@ retry: } /* - * Fault retry nuances, mmap_sem already relinquished by core mm + * Fault retry nuances, mmap_lock already relinquished by core mm */ if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) { @@ -150,7 +150,7 @@ retry: } bad_area: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* * Major/minor page fault accounting diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 479b0d72d3cf346299ac26b38b1bfbd0f8e9d634..1b9f473c6369306067f5da42f46ea5811a037e7e 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -6,8 +6,8 @@ #include #include #include +#include #include -#include #include #include @@ -92,17 +92,9 @@ EXPORT_SYMBOL(kunmap_atomic_high); static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) { - pgd_t *pgd_k; - p4d_t *p4d_k; - pud_t *pud_k; - pmd_t *pmd_k; + pmd_t *pmd_k = pmd_off_k(kvaddr); pte_t *pte_k; - pgd_k = pgd_offset_k(kvaddr); - p4d_k = p4d_offset(pgd_k, kvaddr); - pud_k = pud_offset(p4d_k, kvaddr); - pmd_k = pmd_offset(pud_k, kvaddr); - pte_k = (pte_t *)memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!pte_k) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", diff --git a/arch/arc/mm/tlbex.S b/arch/arc/mm/tlbex.S index 2efaf6ca0c06821f60a86748b8d1d1a26946e13d..31f54bdd95f2c9480aaa3702b4139d25874207db 100644 --- a/arch/arc/mm/tlbex.S +++ b/arch/arc/mm/tlbex.S @@ -33,9 +33,9 @@ */ #include +#include #include #include -#include #include #include #include diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h index deef4d0cb3b50030c621eb200c2226af6b1f6685..673c7dd75ab90c0a6475bacde3adec9743adfd15 100644 --- a/arch/arm/include/asm/bug.h +++ b/arch/arm/include/asm/bug.h @@ -82,7 +82,8 @@ void hook_ifault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, int code, const char *name); -extern asmlinkage void c_backtrace(unsigned long fp, int pmode); +extern asmlinkage void c_backtrace(unsigned long fp, int pmode, + const char *loglvl); struct mm_struct; void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr); diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 7114b9aa46b875c5a506332102bce1c722ea7468..2e24e765e6d3af5d6b7842d45d333d3061bfd1ca 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -258,11 +258,11 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr #define flush_cache_dup_mm(mm) flush_cache_mm(mm) /* - * flush_cache_user_range is used when we want to ensure that the + * flush_icache_user_range is used when we want to ensure that the * Harvard caches are synchronised for the user space address range. * This is used for the ARM private sys_cacheflush system call. */ -#define flush_cache_user_range(s,e) __cpuc_coherent_user_range(s,e) +#define flush_icache_user_range(s,e) __cpuc_coherent_user_range(s,e) /* * Perform necessary cache operations to ensure that data previously @@ -318,9 +318,6 @@ extern void flush_kernel_dcache_page(struct page *); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) -#define flush_icache_user_range(vma,page,addr,len) \ - flush_dcache_page(page) - /* * We don't appear to need to do anything here. In fact, if we did, we'd * duplicate cache flushing elsewhere performed by flush_dcache_page(). diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index 9383f236e7958fc60028fe9125ab87ca9daff0ee..84dc0ba822f58dd3601c24e48513d21d4b01f270 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #ifdef CONFIG_EFI diff --git a/arch/arm/include/asm/fixmap.h b/arch/arm/include/asm/fixmap.h index 472c93db5dac50b31d75e854b48b69cb4748284a..fc56fc3e19316d423155be0e40632b3282656da6 100644 --- a/arch/arm/include/asm/fixmap.h +++ b/arch/arm/include/asm/fixmap.h @@ -6,8 +6,8 @@ #define FIXADDR_END 0xfff00000UL #define FIXADDR_TOP (FIXADDR_END - PAGE_SIZE) +#include #include -#include enum fixed_addresses { FIX_EARLYCON_MEM_BASE, diff --git a/arch/arm/include/asm/idmap.h b/arch/arm/include/asm/idmap.h index 73ba956e379f752ed907d443c89187210c1a1a37..aab7e8358e6ac0e92b25067e6cd937d7b2e7bad4 100644 --- a/arch/arm/include/asm/idmap.h +++ b/arch/arm/include/asm/idmap.h @@ -3,7 +3,7 @@ #define __ASM_IDMAP_H #include -#include +#include /* Tag a function as requiring to be executed via an identity mapping. */ #define __idmap __section(.idmap.text) noinline notrace diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index 9e084a464a9784d2a724cbd3a700fb53401175a4..3502c2f746ca730c441ad7eb4cd7b1fcbcb78d59 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -187,6 +187,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) { return (pmd_t *)pud; } +#define pmd_offset pmd_offset #define pmd_large(pmd) (pmd_val(pmd) & 2) #define pmd_leaf(pmd) (pmd_val(pmd) & 2) diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 1933aed9f68db8663ff782f6bf3cd50b1c42407c..fbb6693c33528055a3f6015b779eb4d1554e2c61 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -133,13 +133,6 @@ static inline pmd_t *pud_page_vaddr(pud_t pud) return __va(pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK); } -/* Find an entry in the second-level page table.. */ -#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) -static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) -{ - return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr); -} - #define pmd_bad(pmd) (!(pmd_val(pmd) & 2)) #define copy_pmd(pmdpd,pmdps) \ diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index 30fb2330f57b8730f0ea4fef1ea763c5811a5523..d16aba48fa0a411d7ee851ba8ded85fe383d8e5c 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h @@ -22,7 +22,6 @@ #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) #define kern_addr_valid(addr) (1) -#define pmd_offset(a, b) ((void *)0) /* FIXME */ /* * PMD_SHIFT determines the size of the area a second-level page table can map @@ -73,8 +72,6 @@ extern unsigned int kobjsize(const void *objp); #define FIRST_USER_ADDRESS 0UL -#include - #else /* diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index fba20607c53cc0bb8200fb774630ee44d9a2a691..c02f24400369bccb68408f46476198a45eacde69 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -166,14 +166,6 @@ extern struct page *empty_zero_page; extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; -/* to find an entry in a page-table-directory */ -#define pgd_index(addr) ((addr) >> PGDIR_SHIFT) - -#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr)) - -/* to find an entry in a kernel page-table-directory */ -#define pgd_offset_k(addr) pgd_offset(&init_mm, addr) - #define pmd_none(pmd) (!pmd_val(pmd)) static inline pte_t *pmd_page_vaddr(pmd_t pmd) @@ -183,21 +175,6 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) -#ifndef CONFIG_HIGHPTE -#define __pte_map(pmd) pmd_page_vaddr(*(pmd)) -#define __pte_unmap(pte) do { } while (0) -#else -#define __pte_map(pmd) (pte_t *)kmap_atomic(pmd_page(*(pmd))) -#define __pte_unmap(pte) kunmap_atomic(pte) -#endif - -#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) - -#define pte_offset_kernel(pmd,addr) (pmd_page_vaddr(*(pmd)) + pte_index(addr)) - -#define pte_offset_map(pmd,addr) (__pte_map(pmd) + pte_index(addr)) -#define pte_unmap(pte) __pte_unmap(pte) - #define pte_pfn(pte) ((pte_val(pte) & PHYS_MASK) >> PAGE_SHIFT) #define pfn_pte(pfn,prot) __pte(__pfn_to_phys(pfn) | pgprot_val(prot)) @@ -339,8 +316,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) /* FIXME: this is not correct */ #define kern_addr_valid(addr) (1) -#include - /* * We provide our own arch_get_unmapped_area to cope with VIPT caches. */ diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h index 172b08ff3760d43bc46306b630a64646b65d5a72..987fefb0a4dbc452fda484512cedf6818838d965 100644 --- a/arch/arm/include/asm/traps.h +++ b/arch/arm/include/asm/traps.h @@ -29,7 +29,8 @@ static inline int __in_irqentry_text(unsigned long ptr) } extern void __init early_trap_init(void *); -extern void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame); +extern void dump_backtrace_entry(unsigned long where, unsigned long from, + unsigned long frame, const char *loglvl); extern void ptrace_break(struct pt_regs *regs); extern void *vectors_page; diff --git a/arch/arm/include/asm/unwind.h b/arch/arm/include/asm/unwind.h index 6e282c33126b6f8365308fd91c4bce33d7448764..0f8a3439902d0613ac2b4ef98a3561d950ec6a43 100644 --- a/arch/arm/include/asm/unwind.h +++ b/arch/arm/include/asm/unwind.h @@ -36,7 +36,8 @@ extern struct unwind_table *unwind_table_add(unsigned long start, unsigned long text_addr, unsigned long text_size); extern void unwind_table_del(struct unwind_table *tab); -extern void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk); +extern void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl); #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm/kernel/fiq.c b/arch/arm/kernel/fiq.c index cd1234c103fcdd7c85fbb7862d05a01dd1734c5e..98ca3e3fa8471f9cd490bf5fe31c7720f89b641f 100644 --- a/arch/arm/kernel/fiq.c +++ b/arch/arm/kernel/fiq.c @@ -98,8 +98,8 @@ void set_fiq_handler(void *start, unsigned int length) memcpy(base + offset, start, length); if (!cache_is_vipt_nonaliasing()) - flush_icache_range((unsigned long)base + offset, offset + - length); + flush_icache_range((unsigned long)base + offset, + (unsigned long)base + offset + length); flush_icache_range(0xffff0000 + offset, 0xffff0000 + offset + length); } diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index c49b39340ddbd8a88a3d810670fa3af7016848cf..f8904227e7fdc44dbb170af6fa4c09f17b3d7247 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -10,6 +10,7 @@ */ #include #include +#include #include #include @@ -18,7 +19,6 @@ #include #include #include -#include #if defined(CONFIG_DEBUG_LL) && !defined(CONFIG_DEBUG_SEMIHOSTING) #include CONFIG_DEBUG_LL_INCLUDE diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 76300f3813e89bc48a76d83dc3076d9f7b79ee84..974b6c64d3e6fb5f471dfe8a8b38d121cd80874b 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index af0a8500a24ead489fd46b77e450d76fc7d19bad..e15444b25ca05081e1d32878104b362cc75ae19d 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 46e478fb5ea203a7a42650ce4233d801db0c4277..58eaa1f60e16402558d79de0113cf0136b803b23 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -431,7 +431,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) npages = 1; /* for sigpage */ npages += vdso_total_pages; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; hint = sigpage_addr(mm, npages); addr = get_unmapped_area(NULL, hint, npages << PAGE_SHIFT, 0, 0); @@ -458,7 +458,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) arm_install_vdso(mm, addr + PAGE_SIZE); up_fail: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } #endif diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 4cc6a7eff6359a23633587742bef6e3690e76ecf..d0f7c8896c96788d075995c01de98387cb04aeda 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -25,7 +25,6 @@ #include #include -#include #include #define CREATE_TRACE_POINTS diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 46e1be9e57a817b843903727340eabf6d85a2007..9a6432557871fecb3b7ec36416d359372cfb3859 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c index d08099269e35b9a6392e74ec2a52d3b81f439616..d2c9338d74e8157d7c4d191d45efd47d59873546 100644 --- a/arch/arm/kernel/suspend.c +++ b/arch/arm/kernel/suspend.c @@ -2,12 +2,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c index e640871328c1ff32f47df101c66dfac5a52b4a5e..6166ba38bf99479d62fa9f2e24c5a0f180ba4851 100644 --- a/arch/arm/kernel/swp_emulate.c +++ b/arch/arm/kernel/swp_emulate.c @@ -97,12 +97,12 @@ static void set_segfault(struct pt_regs *regs, unsigned long addr) { int si_code; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); if (find_vma(current->mm, addr) == NULL) si_code = SEGV_MAPERR; else si_code = SEGV_ACCERR; - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); pr_debug("SWP{B} emulation: access caused memory abort!\n"); arm_notify_die("Illegal memory access", regs, diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 1e70e7227f0ff2283b07becf1ed543b3482bb280..65a3b1e7548023948ea8d208b9872e24d74ed031 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -62,21 +62,24 @@ __setup("user_debug=", user_debug_setup); static void dump_mem(const char *, const char *, unsigned long, unsigned long); -void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame) +void dump_backtrace_entry(unsigned long where, unsigned long from, + unsigned long frame, const char *loglvl) { unsigned long end = frame + 4 + sizeof(struct pt_regs); #ifdef CONFIG_KALLSYMS - printk("[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", where, (void *)where, from, (void *)from); + printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", + loglvl, where, (void *)where, from, (void *)from); #else - printk("Function entered at [<%08lx>] from [<%08lx>]\n", where, from); + printk("%sFunction entered at [<%08lx>] from [<%08lx>]\n", + loglvl, where, from); #endif if (in_entry_text(from) && end <= ALIGN(frame, THREAD_SIZE)) - dump_mem("", "Exception stack", frame + 4, end); + dump_mem(loglvl, "Exception stack", frame + 4, end); } -void dump_backtrace_stm(u32 *stack, u32 instruction) +void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl) { char str[80], *p; unsigned int x; @@ -88,12 +91,12 @@ void dump_backtrace_stm(u32 *stack, u32 instruction) if (++x == 6) { x = 0; p = str; - printk("%s\n", str); + printk("%s%s\n", loglvl, str); } } } if (p != str) - printk("%s\n", str); + printk("%s%s\n", loglvl, str); } #ifndef CONFIG_ARM_UNWIND @@ -201,17 +204,19 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) } #ifdef CONFIG_ARM_UNWIND -static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { - unwind_backtrace(regs, tsk); + unwind_backtrace(regs, tsk, loglvl); } #else -static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { unsigned int fp, mode; int ok = 1; - printk("Backtrace: "); + printk("%sBacktrace: ", loglvl); if (!tsk) tsk = current; @@ -238,13 +243,13 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) pr_cont("\n"); if (ok) - c_backtrace(fp, mode); + c_backtrace(fp, mode, loglvl); } #endif -void show_stack(struct task_struct *tsk, unsigned long *sp) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { - dump_backtrace(NULL, tsk); + dump_backtrace(NULL, tsk, loglvl); barrier(); } @@ -288,7 +293,7 @@ static int __die(const char *str, int err, struct pt_regs *regs) if (!user_mode(regs) || in_interrupt()) { dump_mem(KERN_EMERG, "Stack: ", regs->ARM_sp, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - dump_backtrace(regs, tsk); + dump_backtrace(regs, tsk, KERN_EMERG); dump_instr(KERN_EMERG, regs); } @@ -566,7 +571,7 @@ __do_cache_op(unsigned long start, unsigned long end) if (fatal_signal_pending(current)) return 0; - ret = flush_cache_user_range(start, start + chunk); + ret = flush_icache_user_range(start, start + chunk); if (ret) return ret; @@ -663,10 +668,10 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs) if (user_debug & UDBG_SYSCALL) { pr_err("[%d] %s: arm syscall %d\n", task_pid_nr(current), current->comm, no); - dump_instr("", regs); + dump_instr(KERN_ERR, regs); if (user_mode(regs)) { __show_regs(regs); - c_backtrace(frame_pointer(regs), processor_mode(regs)); + c_backtrace(frame_pointer(regs), processor_mode(regs), KERN_ERR); } } #endif diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c index 11a964fd66f474b7090a45cf8f65938c65bd12d2..d2bd0df2318d62e3dacefc97d367e2decd75351e 100644 --- a/arch/arm/kernel/unwind.c +++ b/arch/arm/kernel/unwind.c @@ -455,7 +455,8 @@ int unwind_frame(struct stackframe *frame) return URC_OK; } -void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk) +void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { struct stackframe frame; @@ -493,7 +494,7 @@ void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk) urc = unwind_frame(&frame); if (urc < 0) break; - dump_backtrace_entry(where, frame.pc, frame.sp - 4); + dump_backtrace_entry(where, frame.pc, frame.sp - 4, loglvl); } } diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index e0330a25e1c6df38f672b0c662fe801278088110..6bfdca4769a70cd932302707733929f9d0a54816 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -240,7 +240,7 @@ static int install_vvar(struct mm_struct *mm, unsigned long addr) return PTR_ERR_OR_ZERO(vma); } -/* assumes mmap_sem is write-locked */ +/* assumes mmap_lock is write-locked */ void arm_install_vdso(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 88a720da443b9815b8459dee496af2be69ecfa53..7f24bc08403e162b2243609d0b7fdd87b418a5fe 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -8,13 +8,13 @@ #include "vmlinux-xip.lds.S" #else +#include #include #include #include #include #include #include -#include #include "vmlinux.lds.h" diff --git a/arch/arm/lib/backtrace-clang.S b/arch/arm/lib/backtrace-clang.S index 2ff375144b55b0347240441205f476aae1a837cc..6174c45f53a5dbfc8c0e2de3de8175b6e17a8d5e 100644 --- a/arch/arm/lib/backtrace-clang.S +++ b/arch/arm/lib/backtrace-clang.S @@ -17,6 +17,7 @@ #define sv_pc r6 #define mask r7 #define sv_lr r8 +#define loglvl r9 ENTRY(c_backtrace) @@ -99,6 +100,7 @@ ENDPROC(c_backtrace) @ to ensure 8 byte alignment movs frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r2 tst r1, #0x10 @ 26 or 32-bit mode? moveq mask, #0xfc000003 movne mask, #0 @ mask for 32-bit @@ -167,6 +169,7 @@ finished_setup: mov r1, sv_lr mov r2, frame bic r1, r1, mask @ mask PC/LR for the mode + mov r3, loglvl bl dump_backtrace_entry /* @@ -183,6 +186,7 @@ finished_setup: ldr r0, [frame] @ locals are stored in @ the preceding frame subeq r0, r0, #4 + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers /* @@ -196,7 +200,8 @@ finished_setup: bhi for_each_frame 1006: adr r0, .Lbad - mov r1, frame + mov r1, loglvl + mov r2, frame bl printk no_frame: ldmfd sp!, {r4 - r9, fp, pc} ENDPROC(c_backtrace) @@ -209,7 +214,7 @@ ENDPROC(c_backtrace) .long 1005b, 1006b .popsection -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Lopcode: .word 0xe92d4800 >> 11 @ stmfd sp!, {... fp, lr} .word 0x0b000000 @ bl if these bits are set diff --git a/arch/arm/lib/backtrace.S b/arch/arm/lib/backtrace.S index 582925238d65ea261cc126078462ce2a66421549..872f658638d99710710963266900701a3a69c2a7 100644 --- a/arch/arm/lib/backtrace.S +++ b/arch/arm/lib/backtrace.S @@ -18,6 +18,7 @@ #define sv_pc r6 #define mask r7 #define offset r8 +#define loglvl r9 ENTRY(c_backtrace) @@ -25,9 +26,10 @@ ENTRY(c_backtrace) ret lr ENDPROC(c_backtrace) #else - stmfd sp!, {r4 - r8, lr} @ Save an extra register so we have a location... + stmfd sp!, {r4 - r9, lr} @ Save an extra register so we have a location... movs frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r2 tst r1, #0x10 @ 26 or 32-bit mode? ARM( moveq mask, #0xfc000003 ) @@ -73,6 +75,7 @@ for_each_frame: tst frame, mask @ Check for address exceptions ldr r1, [frame, #-4] @ get saved lr mov r2, frame bic r1, r1, mask @ mask PC/LR for the mode + mov r3, loglvl bl dump_backtrace_entry ldr r1, [sv_pc, #-4] @ if stmfd sp!, {args} exists, @@ -80,12 +83,14 @@ for_each_frame: tst frame, mask @ Check for address exceptions teq r3, r1, lsr #11 ldreq r0, [frame, #-8] @ get sp subeq r0, r0, #4 @ point at the last arg + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers 1004: ldr r1, [sv_pc, #0] @ if stmfd sp!, {..., fp, ip, lr, pc} ldr r3, .Ldsi @ instruction exists, teq r3, r1, lsr #11 subeq r0, frame, #16 + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers teq sv_fp, #0 @ zero saved fp means @@ -96,9 +101,10 @@ for_each_frame: tst frame, mask @ Check for address exceptions bhi for_each_frame 1006: adr r0, .Lbad - mov r1, frame + mov r1, loglvl + mov r2, frame bl printk -no_frame: ldmfd sp!, {r4 - r8, pc} +no_frame: ldmfd sp!, {r4 - r9, pc} ENDPROC(c_backtrace) .pushsection __ex_table,"a" @@ -109,7 +115,7 @@ ENDPROC(c_backtrace) .long 1004b, 1006b .popsection -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Ldsi: .word 0xe92dd800 >> 11 @ stmfd sp!, {... fp, ip, lr, pc} .word 0xe92d0000 >> 11 @ stmfd sp!, {} diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index d72b14c966702da7d4dd98b207ccb2e2d2214920..106f83a5ea6d26b1a10884217566d126614e0d55 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -101,7 +101,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) atomic = faulthandler_disabled(); if (!atomic) - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); while (n) { pte_t *pte; spinlock_t *ptl; @@ -109,11 +109,11 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) while (!pin_page_for_write(to, &pte, &ptl)) { if (!atomic) - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (__put_user(0, (char __user *)to)) goto out; if (!atomic) - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); } tocopy = (~(unsigned long)to & ~PAGE_MASK) + 1; @@ -133,7 +133,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) spin_unlock(ptl); } if (!atomic) - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); out: return n; @@ -170,17 +170,17 @@ __clear_user_memset(void __user *addr, unsigned long n) return 0; } - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); while (n) { pte_t *pte; spinlock_t *ptl; int tocopy; while (!pin_page_for_write(addr, &pte, &ptl)) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (__put_user(0, (char __user *)addr)) goto out; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); } tocopy = (~(unsigned long)addr & ~PAGE_MASK) + 1; @@ -198,7 +198,7 @@ __clear_user_memset(void __user *addr, unsigned long n) else spin_unlock(ptl); } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); out: return n; diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c index 575b2e2b6759f624ddb74e892e6de64fd02b1940..5960e3dfd2bfc0065bd533ec719fcbcd982bafb8 100644 --- a/arch/arm/mach-ebsa110/core.c +++ b/arch/arm/mach-ebsa110/core.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c index 015f75d1c98d02f6aa7f756f59107092ff4e7947..eee095f0e2f6c2726e13692cd326e424a7812747 100644 --- a/arch/arm/mach-footbridge/common.c +++ b/arch/arm/mach-footbridge/common.c @@ -14,7 +14,6 @@ #include #include