summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-bus-i2c-devices-bq32k7
-rw-r--r--Documentation/cgroup-v1/rdma.txt109
-rw-r--r--Documentation/cgroup-v2.txt103
-rw-r--r--Documentation/devicetree/bindings/rtc/armada-380-rtc.txt8
-rw-r--r--Documentation/devicetree/bindings/rtc/cortina,gemini.txt14
-rw-r--r--Documentation/devicetree/bindings/rtc/imxdi-rtc.txt5
-rw-r--r--Documentation/devicetree/bindings/rtc/maxim,ds3231.txt3
-rw-r--r--Documentation/devicetree/bindings/rtc/pcf8563.txt3
-rw-r--r--Documentation/devicetree/bindings/rtc/st,stm32-rtc.txt27
-rw-r--r--Documentation/devicetree/bindings/rtc/sun6i-rtc.txt10
-rw-r--r--Documentation/static-keys.txt4
-rw-r--r--arch/arm/mach-ep93xx/ts72xx.c40
-rw-r--r--arch/arm/mach-ep93xx/ts72xx.h11
-rw-r--r--arch/arm/mach-orion5x/ts78xx-setup.c80
-rw-r--r--arch/m68k/configs/amcore_defconfig14
-rw-r--r--arch/s390/Kconfig1
-rw-r--r--arch/s390/configs/default_defconfig1
-rw-r--r--arch/s390/configs/performance_defconfig1
-rw-r--r--arch/s390/crypto/Makefile2
-rw-r--r--arch/s390/crypto/paes_s390.c619
-rw-r--r--arch/s390/defconfig1
-rw-r--r--arch/s390/include/asm/cpacf.h46
-rw-r--r--arch/s390/include/asm/mmu_context.h4
-rw-r--r--arch/s390/include/asm/pgtable.h14
-rw-r--r--arch/s390/include/asm/pkey.h90
-rw-r--r--arch/s390/include/asm/processor.h19
-rw-r--r--arch/s390/include/asm/uaccess.h23
-rw-r--r--arch/s390/include/uapi/asm/Kbuild1
-rw-r--r--arch/s390/include/uapi/asm/pkey.h112
-rw-r--r--arch/s390/kernel/entry.S33
-rw-r--r--arch/s390/kernel/entry.h1
-rw-r--r--arch/s390/kernel/nmi.c25
-rw-r--r--arch/s390/kernel/process.c18
-rw-r--r--arch/s390/mm/gmap.c6
-rw-r--r--arch/s390/mm/hugetlbpage.c2
-rw-r--r--drivers/crypto/Kconfig32
-rw-r--r--drivers/infiniband/core/Makefile1
-rw-r--r--drivers/infiniband/core/cgroup.c62
-rw-r--r--drivers/infiniband/core/core_priv.h30
-rw-r--r--drivers/infiniband/core/device.c10
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c102
-rw-r--r--drivers/infiniband/core/uverbs_main.c20
-rw-r--r--drivers/rtc/Kconfig18
-rw-r--r--drivers/rtc/Makefile1
-rw-r--r--drivers/rtc/rtc-armada38x.c318
-rw-r--r--drivers/rtc/rtc-au1xxx.c2
-rw-r--r--drivers/rtc/rtc-bfin.c2
-rw-r--r--drivers/rtc/rtc-bq32k.c76
-rw-r--r--drivers/rtc/rtc-dm355evm.c2
-rw-r--r--drivers/rtc/rtc-ds3232.c53
-rw-r--r--drivers/rtc/rtc-gemini.c7
-rw-r--r--drivers/rtc/rtc-imxdi.c33
-rw-r--r--drivers/rtc/rtc-ls1x.c2
-rw-r--r--drivers/rtc/rtc-m48t86.c272
-rw-r--r--drivers/rtc/rtc-mcp795.c183
-rw-r--r--drivers/rtc/rtc-mxc.c2
-rw-r--r--drivers/rtc/rtc-pcf2127.c15
-rw-r--r--drivers/rtc/rtc-rx8010.c24
-rw-r--r--drivers/rtc/rtc-sh.c2
-rw-r--r--drivers/rtc/rtc-snvs.c1
-rw-r--r--drivers/rtc/rtc-stm32.c725
-rw-r--r--drivers/rtc/rtc-sun6i.c182
-rw-r--r--drivers/rtc/rtc-tegra.c41
-rw-r--r--drivers/rtc/rtc-tps65910.c146
-rw-r--r--drivers/s390/block/dasd_eckd.c2
-rw-r--r--drivers/s390/cio/ioasm.c8
-rw-r--r--drivers/s390/crypto/Makefile4
-rw-r--r--drivers/s390/crypto/ap_bus.c10
-rw-r--r--drivers/s390/crypto/ap_card.c24
-rw-r--r--drivers/s390/crypto/ap_queue.c21
-rw-r--r--drivers/s390/crypto/pkey_api.c1148
-rw-r--r--drivers/s390/crypto/zcrypt_api.c5
-rw-r--r--drivers/s390/crypto/zcrypt_api.h2
-rw-r--r--fs/kernfs/dir.c2
-rw-r--r--fs/kernfs/file.c62
-rw-r--r--fs/kernfs/kernfs-internal.h2
-rw-r--r--include/linux/cgroup-defs.h57
-rw-r--r--include/linux/cgroup.h2
-rw-r--r--include/linux/cgroup_rdma.h53
-rw-r--r--include/linux/cgroup_subsys.h4
-rw-r--r--include/linux/compiler.h27
-rw-r--r--include/linux/jump_label.h23
-rw-r--r--include/linux/kernfs.h12
-rw-r--r--include/linux/mfd/tps65910.h1
-rw-r--r--include/linux/platform_data/rtc-m48t86.h16
-rw-r--r--include/linux/timer.h2
-rw-r--r--include/linux/workqueue.h4
-rw-r--r--include/rdma/ib_verbs.h14
-rw-r--r--include/trace/events/timer.h14
-rw-r--r--init/Kconfig10
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/cgroup/Makefile6
-rw-r--r--kernel/cgroup/cgroup-internal.h214
-rw-r--r--kernel/cgroup/cgroup-v1.c1395
-rw-r--r--kernel/cgroup/cgroup.c (renamed from kernel/cgroup.c)2081
-rw-r--r--kernel/cgroup/cpuset.c (renamed from kernel/cpuset.c)0
-rw-r--r--kernel/cgroup/freezer.c (renamed from kernel/cgroup_freezer.c)0
-rw-r--r--kernel/cgroup/namespace.c155
-rw-r--r--kernel/cgroup/pids.c (renamed from kernel/cgroup_pids.c)0
-rw-r--r--kernel/cgroup/rdma.c619
-rw-r--r--kernel/events/core.c6
-rw-r--r--kernel/jump_label.c153
-rw-r--r--kernel/trace/ftrace.c372
-rw-r--r--kernel/trace/trace.c13
-rw-r--r--kernel/trace/trace.h83
-rw-r--r--kernel/trace/trace_benchmark.c4
-rw-r--r--kernel/trace/trace_branch.c83
-rw-r--r--kernel/trace/trace_entries.h6
-rw-r--r--kernel/trace/trace_hwlat.c39
-rw-r--r--kernel/trace/trace_kprobe.c1
-rw-r--r--kernel/trace/trace_probe.c50
-rw-r--r--kernel/trace/trace_uprobe.c4
-rw-r--r--lib/percpu_counter.c5
-rw-r--r--tools/perf/util/cgroup.c26
-rwxr-xr-xtools/testing/ktest/ktest.pl111
115 files changed, 8092 insertions, 2689 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-bq32k b/Documentation/ABI/testing/sysfs-bus-i2c-devices-bq32k
new file mode 100644
index 000000000000..398b258fb770
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-bq32k
@@ -0,0 +1,7 @@
+What: /sys/bus/i2c/devices/.../trickle_charge_bypass
+Date: Jan 2017
+KernelVersion: 4.11
+Contact: Enric Balletbo i Serra <eballetbo@gmail.com>
+Description: Attribute for enable/disable the trickle charge bypass
+ The trickle_charge_bypass attribute allows the userspace to
+ enable/disable the Trickle charge FET bypass.
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
new file mode 100644
index 000000000000..af618171e0eb
--- /dev/null
+++ b/Documentation/cgroup-v1/rdma.txt
@@ -0,0 +1,109 @@
+ RDMA Controller
+ ----------------
+
+Contents
+--------
+
+1. Overview
+ 1-1. What is RDMA controller?
+ 1-2. Why RDMA controller needed?
+ 1-3. How is RDMA controller implemented?
+2. Usage Examples
+
+1. Overview
+
+1-1. What is RDMA controller?
+-----------------------------
+
+RDMA controller allows user to limit RDMA/IB specific resources that a given
+set of processes can use. These processes are grouped using RDMA controller.
+
+RDMA controller defines two resources which can be limited for processes of a
+cgroup.
+
+1-2. Why RDMA controller needed?
+--------------------------------
+
+Currently user space applications can easily take away all the rdma verb
+specific resources such as AH, CQ, QP, MR etc. Due to which other applications
+in other cgroup or kernel space ULPs may not even get chance to allocate any
+rdma resources. This can leads to service unavailability.
+
+Therefore RDMA controller is needed through which resource consumption
+of processes can be limited. Through this controller different rdma
+resources can be accounted.
+
+1-3. How is RDMA controller implemented?
+----------------------------------------
+
+RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
+resource accounting per cgroup, per device using resource pool structure.
+Each such resource pool is limited up to 64 resources in given resource pool
+by rdma cgroup, which can be extended later if required.
+
+This resource pool object is linked to the cgroup css. Typically there
+are 0 to 4 resource pool instances per cgroup, per device in most use cases.
+But nothing limits to have it more. At present hundreds of RDMA devices per
+single cgroup may not be handled optimally, however there is no
+known use case or requirement for such configuration either.
+
+Since RDMA resources can be allocated from any process and can be freed by any
+of the child processes which shares the address space, rdma resources are
+always owned by the creator cgroup css. This allows process migration from one
+to other cgroup without major complexity of transferring resource ownership;
+because such ownership is not really present due to shared nature of
+rdma resources. Linking resources around css also ensures that cgroups can be
+deleted after processes migrated. This allow progress migration as well with
+active resources, even though that is not a primary use case.
+
+Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
+the caller. Same rdma cgroup should be passed while uncharging the resource.
+This also allows process migrated with active RDMA resource to charge
+to new owner cgroup for new resource. It also allows to uncharge resource of
+a process from previously charged cgroup which is migrated to new cgroup,
+even though that is not a primary use case.
+
+Resource pool object is created in following situations.
+(a) User sets the limit and no previous resource pool exist for the device
+of interest for the cgroup.
+(b) No resource limits were configured, but IB/RDMA stack tries to
+charge the resource. So that it correctly uncharge them when applications are
+running without limits and later on when limits are enforced during uncharging,
+otherwise usage count will drop to negative.
+
+Resource pool is destroyed if all the resource limits are set to max and
+it is the last resource getting deallocated.
+
+User should set all the limit to max value if it intents to remove/unconfigure
+the resource pool for a particular device.
+
+IB stack honors limits enforced by the rdma controller. When application
+query about maximum resource limits of IB device, it returns minimum of
+what is configured by user for a given cgroup and what is supported by
+IB device.
+
+Following resources can be accounted by rdma controller.
+ hca_handle Maximum number of HCA Handles
+ hca_object Maximum number of HCA Objects
+
+2. Usage Examples
+-----------------
+
+(a) Configure resource limit:
+echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
+echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
+
+(b) Query resource limit:
+cat /sys/fs/cgroup/rdma/2/rdma.max
+#Output:
+mlx4_0 hca_handle=2 hca_object=2000
+ocrdma1 hca_handle=3 hca_object=max
+
+(c) Query current usage:
+cat /sys/fs/cgroup/rdma/2/rdma.current
+#Output:
+mlx4_0 hca_handle=1 hca_object=20
+ocrdma1 hca_handle=1 hca_object=23
+
+(d) Delete resource limit:
+echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 4cc07ce3b8dd..3b8449f8ac7e 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -47,6 +47,12 @@ CONTENTS
5-3. IO
5-3-1. IO Interface Files
5-3-2. Writeback
+ 5-4. PID
+ 5-4-1. PID Interface Files
+ 5-5. RDMA
+ 5-5-1. RDMA Interface Files
+ 5-6. Misc
+ 5-6-1. perf_event
6. Namespace
6-1. Basics
6-2. The Root and Views
@@ -328,14 +334,12 @@ a process with a non-root euid to migrate a target process into a
cgroup by writing its PID to the "cgroup.procs" file, the following
conditions must be met.
-- The writer's euid must match either uid or suid of the target process.
-
- The writer must have write access to the "cgroup.procs" file.
- The writer must have write access to the "cgroup.procs" file of the
common ancestor of the source and destination cgroups.
-The above three constraints ensure that while a delegatee may migrate
+The above two constraints ensure that while a delegatee may migrate
processes around freely in the delegated sub-hierarchy it can't pull
in from or push out to outside the sub-hierarchy.
@@ -350,10 +354,10 @@ all processes under C0 and C1 belong to U0.
Let's also say U0 wants to write the PID of a process which is
currently in C10 into "C00/cgroup.procs". U0 has write access to the
-file and uid match on the process; however, the common ancestor of the
-source cgroup C10 and the destination cgroup C00 is above the points
-of delegation and U0 would not have write access to its "cgroup.procs"
-files and thus the write will be denied with -EACCES.
+file; however, the common ancestor of the source cgroup C10 and the
+destination cgroup C00 is above the points of delegation and U0 would
+not have write access to its "cgroup.procs" files and thus the write
+will be denied with -EACCES.
2-6. Guidelines
@@ -1119,6 +1123,91 @@ writeback as follows.
vm.dirty[_background]_ratio.
+5-4. PID
+
+The process number controller is used to allow a cgroup to stop any
+new tasks from being fork()'d or clone()'d after a specified limit is
+reached.
+
+The number of tasks in a cgroup can be exhausted in ways which other
+controllers cannot prevent, thus warranting its own controller. For
+example, a fork bomb is likely to exhaust the number of tasks before
+hitting memory restrictions.
+
+Note that PIDs used in this controller refer to TIDs, process IDs as
+used by the kernel.
+
+
+5-4-1. PID Interface Files
+
+ pids.max
+
+ A read-write single value file which exists on non-root cgroups. The
+ default is "max".
+
+ Hard limit of number of processes.
+
+ pids.current
+
+ A read-only single value file which exists on all cgroups.
+
+ The number of processes currently in the cgroup and its descendants.
+
+Organisational operations are not blocked by cgroup policies, so it is
+possible to have pids.current > pids.max. This can be done by either
+setting the limit to be smaller than pids.current, or attaching enough
+processes to the cgroup such that pids.current is larger than
+pids.max. However, it is not possible to violate a cgroup PID policy
+through fork() or clone(). These will return -EAGAIN if the creation
+of a new process would cause a cgroup policy to be violated.
+
+
+5-5. RDMA
+
+The "rdma" controller regulates the distribution and accounting of
+of RDMA resources.
+
+5-5-1. RDMA Interface Files
+
+ rdma.max
+ A readwrite nested-keyed file that exists for all the cgroups
+ except root that describes current configured resource limit
+ for a RDMA/IB device.
+
+ Lines are keyed by device name and are not ordered.
+ Each line contains space separated resource name and its configured
+ limit that can be distributed.
+
+ The following nested keys are defined.
+
+ hca_handle Maximum number of HCA Handles
+ hca_object Maximum number of HCA Objects
+
+ An example for mlx4 and ocrdma device follows.
+
+ mlx4_0 hca_handle=2 hca_object=2000
+ ocrdma1 hca_handle=3 hca_object=max
+
+ rdma.current
+ A read-only file that describes current resource usage.
+ It exists for all the cgroup except root.
+
+ An example for mlx4 and ocrdma device follows.
+
+ mlx4_0 hca_handle=1 hca_object=20
+ ocrdma1 hca_handle=1 hca_object=23
+
+
+5-6. Misc
+
+5-6-1. perf_event
+
+perf_event controller, if not mounted on a legacy hierarchy, is
+automatically enabled on the v2 hierarchy so that perf events can
+always be filtered by cgroup v2 path. The controller can still be
+moved to a legacy hierarchy after v2 hierarchy is populated.
+
+
6. Namespace
6-1. Basics
diff --git a/Documentation/devicetree/bindings/rtc/armada-380-rtc.txt b/Documentation/devicetree/bindings/rtc/armada-380-rtc.txt
index 2eb9d4ee7dc0..c3c9a1226f9a 100644
--- a/Documentation/devicetree/bindings/rtc/armada-380-rtc.txt
+++ b/Documentation/devicetree/bindings/rtc/armada-380-rtc.txt
@@ -1,9 +1,11 @@
-* Real Time Clock of the Armada 38x SoCs
+* Real Time Clock of the Armada 38x/7K/8K SoCs
-RTC controller for the Armada 38x SoCs
+RTC controller for the Armada 38x, 7K and 8K SoCs
Required properties:
-- compatible : Should be "marvell,armada-380-rtc"
+- compatible : Should be one of the following:
+ "marvell,armada-380-rtc" for Armada 38x SoC
+ "marvell,armada-8k-rtc" for Aramda 7K/8K SoCs
- reg: a list of base address and size pairs, one for each entry in
reg-names
- reg names: should contain:
diff --git a/Documentation/devicetree/bindings/rtc/cortina,gemini.txt b/Documentation/devicetree/bindings/rtc/cortina,gemini.txt
new file mode 100644
index 000000000000..4ce4e794ddbb
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/cortina,gemini.txt
@@ -0,0 +1,14 @@
+* Cortina Systems Gemini RTC
+
+Gemini SoC real-time clock.
+
+Required properties:
+- compatible : Should be "cortina,gemini-rtc"
+
+Examples:
+
+rtc@45000000 {
+ compatible = "cortina,gemini-rtc";
+ reg = <0x45000000 0x100>;
+ interrupts = <17 IRQ_TYPE_LEVEL_HIGH>;
+};
diff --git a/Documentation/devicetree/bindings/rtc/imxdi-rtc.txt b/Documentation/devicetree/bindings/rtc/imxdi-rtc.txt
index c9d80d7da141..323cf26374cb 100644
--- a/Documentation/devicetree/bindings/rtc/imxdi-rtc.txt
+++ b/Documentation/devicetree/bindings/rtc/imxdi-rtc.txt
@@ -8,10 +8,13 @@ Required properties:
region.
- interrupts: rtc alarm interrupt
+Optional properties:
+- interrupts: dryice security violation interrupt
+
Example:
rtc@80056000 {
compatible = "fsl,imx53-rtc", "fsl,imx25-rtc";
reg = <0x80056000 2000>;
- interrupts = <29>;
+ interrupts = <29 56>;
};
diff --git a/Documentation/devicetree/bindings/rtc/maxim,ds3231.txt b/Documentation/devicetree/bindings/rtc/maxim,ds3231.txt
index 1ad4c1c2b3b3..85be53a42180 100644
--- a/Documentation/devicetree/bindings/rtc/maxim,ds3231.txt
+++ b/Documentation/devicetree/bindings/rtc/maxim,ds3231.txt
@@ -1,7 +1,8 @@
* Maxim DS3231 Real Time Clock
Required properties:
-see: Documentation/devicetree/bindings/i2c/trivial-admin-guide/devices.rst
+- compatible: Should contain "maxim,ds3231".
+- reg: I2C address for chip.
Optional property:
- #clock-cells: Should be 1.
diff --git a/Documentation/devicetree/bindings/rtc/pcf8563.txt b/Documentation/devicetree/bindings/rtc/pcf8563.txt
index 086c998c5561..36984acbb383 100644
--- a/Documentation/devicetree/bindings/rtc/pcf8563.txt
+++ b/Documentation/devicetree/bindings/rtc/pcf8563.txt
@@ -3,7 +3,8 @@
Philips PCF8563/Epson RTC8564 Real Time Clock
Required properties:
-see: Documentation/devicetree/bindings/i2c/trivial-admin-guide/devices.rst
+- compatible: Should contain "nxp,pcf8563".
+- reg: I2C address for chip.
Optional property:
- #clock-cells: Should be 0.
diff --git a/Documentation/devicetree/bindings/rtc/st,stm32-rtc.txt b/Documentation/devicetree/bindings/rtc/st,stm32-rtc.txt
new file mode 100644
index 000000000000..e2837b951237
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/st,stm32-rtc.txt
@@ -0,0 +1,27 @@
+STM32 Real Time Clock
+
+Required properties:
+- compatible: "st,stm32-rtc".
+- reg: address range of rtc register set.
+- clocks: reference to the clock entry ck_rtc.
+- interrupt-parent: phandle for the interrupt controller.
+- interrupts: rtc alarm interrupt.
+- st,syscfg: phandle for pwrcfg, mandatory to disable/enable backup domain
+ (RTC registers) write protection.
+
+Optional properties (to override default ck_rtc parent clock):
+- assigned-clocks: reference to the ck_rtc clock entry.
+- assigned-clock-parents: phandle of the new parent clock of ck_rtc.
+
+Example:
+
+ rtc: rtc@40002800 {
+ compatible = "st,stm32-rtc";
+ reg = <0x40002800 0x400>;
+ clocks = <&rcc 1 CLK_RTC>;
+ assigned-clocks = <&rcc 1 CLK_RTC>;
+ assigned-clock-parents = <&rcc 1 CLK_LSE>;
+ interrupt-parent = <&exti>;
+ interrupts = <17 1>;
+ st,syscfg = <&pwrcfg>;
+ };
diff --git a/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt b/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt
index f007e428a1ab..945934918b71 100644
--- a/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt
+++ b/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt
@@ -8,10 +8,20 @@ Required properties:
memory mapped region.
- interrupts : IRQ lines for the RTC alarm 0 and alarm 1, in that order.
+Required properties for new device trees
+- clocks : phandle to the 32kHz external oscillator
+- clock-output-names : name of the LOSC clock created
+- #clock-cells : must be equals to 1. The RTC provides two clocks: the
+ LOSC and its external output, with index 0 and 1
+ respectively.
+
Example:
rtc: rtc@01f00000 {
compatible = "allwinner,sun6i-a31-rtc";
reg = <0x01f00000 0x54>;
interrupts = <0 40 4>, <0 41 4>;
+ clock-output-names = "osc32k";
+ clocks = <&ext_osc32k>;
+ #clock-cells = <1>;
};
diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt
index ea8d7b4e53f0..32a25fad0c1b 100644
--- a/Documentation/static-keys.txt
+++ b/Documentation/static-keys.txt
@@ -155,7 +155,9 @@ or:
There are a few functions and macros that architectures must implement in order
to take advantage of this optimization. If there is no architecture support, we
-simply fall back to a traditional, load, test, and jump sequence.
+simply fall back to a traditional, load, test, and jump sequence. Also, the
+struct jump_entry table must be at least 4-byte aligned because the
+static_key->entry field makes use of the two least significant bits.
* select HAVE_ARCH_JUMP_LABEL, see: arch/x86/Kconfig
diff --git a/arch/arm/mach-ep93xx/ts72xx.c b/arch/arm/mach-ep93xx/ts72xx.c
index 3b39ea353d30..8a5b6f059498 100644
--- a/arch/arm/mach-ep93xx/ts72xx.c
+++ b/arch/arm/mach-ep93xx/ts72xx.c
@@ -16,7 +16,6 @@
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/io.h>
-#include <linux/platform_data/rtc-m48t86.h>
#include <linux/mtd/nand.h>
#include <linux/mtd/partitions.h>
@@ -45,16 +44,6 @@ static struct map_desc ts72xx_io_desc[] __initdata = {
.pfn = __phys_to_pfn(TS72XX_OPTIONS2_PHYS_BASE),
.length = TS72XX_OPTIONS2_SIZE,
.type = MT_DEVICE,
- }, {
- .virtual = (unsigned long)TS72XX_RTC_INDEX_VIRT_BASE,
- .pfn = __phys_to_pfn(TS72XX_RTC_INDEX_PHYS_BASE),
- .length = TS72XX_RTC_INDEX_SIZE,
- .type = MT_DEVICE,
- }, {
- .virtual = (unsigned long)TS72XX_RTC_DATA_VIRT_BASE,
- .pfn = __phys_to_pfn(TS72XX_RTC_DATA_PHYS_BASE),
- .length = TS72XX_RTC_DATA_SIZE,
- .type = MT_DEVICE,
}
};
@@ -179,31 +168,22 @@ static void __init ts72xx_register_flash(void)
}
}
+/*************************************************************************
+ * RTC M48T86
+ *************************************************************************/
+#define TS72XX_RTC_INDEX_PHYS_BASE (EP93XX_CS1_PHYS_BASE + 0x00800000)
+#define TS72XX_RTC_DATA_PHYS_BASE (EP93XX_CS1_PHYS_BASE + 0x01700000)
-static unsigned char ts72xx_rtc_readbyte(unsigned long addr)
-{
- __raw_writeb(addr, TS72XX_RTC_INDEX_VIRT_BASE);
- return __raw_readb(TS72XX_RTC_DATA_VIRT_BASE);
-}
-
-static void ts72xx_rtc_writebyte(unsigned char value, unsigned long addr)
-{
- __raw_writeb(addr, TS72XX_RTC_INDEX_VIRT_BASE);
- __raw_writeb(value, TS72XX_RTC_DATA_VIRT_BASE);
-}
-
-static struct m48t86_ops ts72xx_rtc_ops = {
- .readbyte = ts72xx_rtc_readbyte,
- .writebyte = ts72xx_rtc_writebyte,
+static struct resource ts72xx_rtc_resources[] = {
+ DEFINE_RES_MEM(TS72XX_RTC_INDEX_PHYS_BASE, 0x01),
+ DEFINE_RES_MEM(TS72XX_RTC_DATA_PHYS_BASE, 0x01),
};
static struct platform_device ts72xx_rtc_device = {
.name = "rtc-m48t86",
.id = -1,
- .dev = {
- .platform_data = &ts72xx_rtc_ops,
- },
- .num_resources = 0,
+ .resource = ts72xx_rtc_resources,
+ .num_resources = ARRAY_SIZE(ts72xx_rtc_resources),
};
static struct resource ts72xx_wdt_resources[] = {
diff --git a/arch/arm/mach-ep93xx/ts72xx.h b/arch/arm/mach-ep93xx/ts72xx.h
index 071feaa30adc..2255ba29fdd6 100644
--- a/arch/arm/mach-ep93xx/ts72xx.h
+++ b/arch/arm/mach-ep93xx/ts72xx.h
@@ -9,8 +9,6 @@
* febff000 22000000 4K model number register (bits 0-2)
* febfe000 22400000 4K options register
* febfd000 22800000 4K options register #2
- * febf9000 10800000 4K TS-5620 RTC index register
- * febf8000 11700000 4K TS-5620 RTC data register
*/
#define TS72XX_MODEL_PHYS_BASE 0x22000000
@@ -40,15 +38,6 @@
#define TS72XX_OPTIONS2_TS9420 0x04
#define TS72XX_OPTIONS2_TS9420_BOOT 0x02
-
-#define TS72XX_RTC_INDEX_VIRT_BASE IOMEM(0xfebf9000)
-#define TS72XX_RTC_INDEX_PHYS_BASE 0x10800000
-#define TS72XX_RTC_INDEX_SIZE 0x00001000
-
-#define TS72XX_RTC_DATA_VIRT_BASE IOMEM(0xfebf8000)
-#define TS72XX_RTC_DATA_PHYS_BASE 0x11700000
-#define TS72XX_RTC_DATA_SIZE 0x00001000
-
#define TS72XX_WDT_CONTROL_PHYS_BASE 0x23800000
#define TS72XX_WDT_FEED_PHYS_BASE 0x23c00000
diff --git a/arch/arm/mach-orion5x/ts78xx-setup.c b/arch/arm/mach-orion5x/ts78xx-setup.c
index 8d597267d0c4..7ef80a8304c0 100644
--- a/arch/arm/mach-orion5x/ts78xx-setup.c
+++ b/arch/arm/mach-orion5x/ts78xx-setup.c
@@ -16,7 +16,6 @@
#include <linux/platform_device.h>
#include <linux/mv643xx_eth.h>
#include <linux/ata_platform.h>
-#include <linux/platform_data/rtc-m48t86.h>
#include <linux/mtd/nand.h>
#include <linux/mtd/partitions.h>
#include <linux/timeriomem-rng.h>
@@ -80,79 +79,38 @@ static struct mv_sata_platform_data ts78xx_sata_data = {
/*****************************************************************************
* RTC M48T86 - nicked^Wborrowed from arch/arm/mach-ep93xx/ts72xx.c
****************************************************************************/
-#define TS_RTC_CTRL (TS78XX_FPGA_REGS_VIRT_BASE + 0x808)
-#define TS_RTC_DATA (TS78XX_FPGA_REGS_VIRT_BASE + 0x80c)
+#define TS_RTC_CTRL (TS78XX_FPGA_REGS_PHYS_BASE + 0x808)
+#define TS_RTC_DATA (TS78XX_FPGA_REGS_PHYS_BASE + 0x80c)
-static unsigned char ts78xx_ts_rtc_readbyte(unsigned long addr)
-{
- writeb(addr, TS_RTC_CTRL);
- return readb(TS_RTC_DATA);
-}
-
-static void ts78xx_ts_rtc_writebyte(unsigned char value, unsigned long addr)
-{
- writeb(addr, TS_RTC_CTRL);
- writeb(value, TS_RTC_DATA);
-}
-
-static struct m48t86_ops ts78xx_ts_rtc_ops = {
- .readbyte = ts78xx_ts_rtc_readbyte,
- .writebyte = ts78xx_ts_rtc_writebyte,
+static struct resource ts78xx_ts_rtc_resources[] = {
+ DEFINE_RES_MEM(TS_RTC_CTRL, 0x01),
+ DEFINE_RES_MEM(TS_RTC_DATA, 0x01),
};
static struct platform_device ts78xx_ts_rtc_device = {
.name = "rtc-m48t86",
.id = -1,
- .dev = {
- .platform_data = &ts78xx_ts_rtc_ops,
- },
- .num_resources = 0,
+ .resource = ts78xx_ts_rtc_resources,
+ .num_resources = ARRAY_SIZE(ts78xx_ts_rtc_resources),
};
-/*
- * TS uses some of the user storage space on the RTC chip so see if it is
- * present; as it's an optional feature at purchase time and not all boards
- * will have it present
- *
- * I've used the method TS use in their rtc7800.c example for the detection
- *
- * TODO: track down a guinea pig without an RTC to see if we can work out a
- * better RTC detection routine
- */
static int ts78xx_ts_rtc_load(void)
{
int rc;
- unsigned char tmp_rtc0, tmp_rtc1;
-
- tmp_rtc0 = ts78xx_ts_rtc_readbyte(126);
- tmp_rtc1 = ts78xx_ts_rtc_readbyte(127);
-
- ts78xx_ts_rtc_writebyte(0x00, 126);
- ts78xx_ts_rtc_writebyte(0x55, 127);
- if (ts78xx_ts_rtc_readbyte(127) == 0x55) {
- ts78xx_ts_rtc_writebyte(0xaa, 127);
- if (ts78xx_ts_rtc_readbyte(127) == 0xaa
- && ts78xx_ts_rtc_readbyte(126) == 0x00) {
- ts78xx_ts_rtc_writebyte(tmp_rtc0, 126);
- ts78xx_ts_rtc_writebyte(tmp_rtc1, 127);
-
- if (ts78xx_fpga.supports.ts_rtc.init == 0) {
- rc = platform_device_register(&ts78xx_ts_rtc_device);
- if (!rc)
- ts78xx_fpga.supports.ts_rtc.init = 1;
- } else
- rc = platform_device_add(&ts78xx_ts_rtc_device);
-
- if (rc)
- pr_info("RTC could not be registered: %d\n",
- rc);
- return rc;
- }
+
+ if (ts78xx_fpga.supports.ts_rtc.init == 0) {
+ rc = platform_device_register(&ts78xx_ts_rtc_device);
+ if (!rc)
+ ts78xx_fpga.supports.ts_rtc.init = 1;
+ } else {
+ rc = platform_device_add(&ts78xx_ts_rtc_device);
}
- pr_info("RTC not found\n");
- return -ENODEV;
-};
+ if (rc)
+ pr_info("RTC could not be registered: %d\n", rc);
+
+ return rc;
+}
static void ts78xx_ts_rtc_unload(void)
{
diff --git a/arch/m68k/configs/amcore_defconfig b/arch/m68k/configs/amcore_defconfig
index f108dd121e9a..131b4101ae5d 100644
--- a/arch/m68k/configs/amcore_defconfig
+++ b/arch/m68k/configs/amcore_defconfig
@@ -1,19 +1,20 @@
-CONFIG_LOCALVERSION="amcore-001"
+CONFIG_LOCALVERSION="amcore-002"
CONFIG_DEFAULT_HOSTNAME="amcore"
CONFIG_SYSVIPC=y
# CONFIG_FHANDLE is not set
# CONFIG_USELIB is not set
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_NAMESPACES=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
# CONFIG_AIO is not set
# CONFIG_ADVISE_SYSCALLS is not set
# CONFIG_MEMBARRIER is not set
CONFIG_EMBEDDED=y
# CONFIG_VM_EVENT_COUNTERS is not set
+# CONFIG_SLUB_DEBUG is not set
# CONFIG_COMPAT_BRK is not set
# CONFIG_LBDAF is not set
# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_IOSCHED_CFQ is not set
# CONFIG_MMU is not set
CONFIG_M5307=y
CONFIG_AMCORE=y
@@ -27,13 +28,14 @@ CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
CONFIG_INET=y
+CONFIG_SYN_COOKIES=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
# CONFIG_IPV6 is not set
# CONFIG_WIRELESS is not set
# CONFIG_UEVENT_HELPER is not set
-CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y
+# CONFIG_FW_LOADER is not set
# CONFIG_ALLOW_DEV_COREDUMP is not set
CONFIG_CONNECTOR=y
CONFIG_MTD=y
@@ -53,6 +55,7 @@ CONFIG_MTD_UCLINUX=y
CONFIG_MTD_PLATRAM=y
CONFIG_BLK_DEV_RAM=y
CONFIG_NETDEVICES=y
+# CONFIG_NET_VENDOR_AMAZON is not set
# CONFIG_NET_VENDOR_ARC is not set
# CONFIG_NET_CADENCE is not set
# CONFIG_NET_VENDOR_BROADCOM is not set
@@ -89,14 +92,12 @@ CONFIG_I2C=y
CONFIG_I2C_CHARDEV=y
# CONFIG_I2C_HELPER_AUTO is not set
CONFIG_I2C_IMX=y
-CONFIG_PPS=y
+CONFIG_GPIO_SYSFS=y
# CONFIG_HWMON is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_RTC_CLASS=y
# CONFIG_RTC_SYSTOHC is not set
CONFIG_RTC_DRV_DS1307=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
# CONFIG_FILE_LOCKING is not set
# CONFIG_DNOTIFY is not set
# CONFIG_INOTIFY_USER is not set
@@ -108,6 +109,7 @@ CONFIG_ROMFS_BACKED_BY_BOTH=y
# CONFIG_NETWORK_FILESYSTEMS is not set
CONFIG_PRINTK_TIME=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
CONFIG_PANIC_ON_OOPS=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d5c1073a2584..a2dcef0aacc7 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -134,6 +134,7 @@ config S390
select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
+ select HAVE_COPY_THREAD_TLS
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_API_DEBUG
select HAVE_DMA_CONTIGUOUS
diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig
index e00975361fec..143b1e00b818 100644
--- a/arch/s390/configs/default_defconfig
+++ b/arch/s390/configs/default_defconfig
@@ -678,6 +678,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
CONFIG_CRYPTO_SHA1_S390=m
CONFIG_CRYPTO_SHA256_S390=m
CONFIG_CRYPTO_SHA512_S390=m
diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig
index 2cf87343b590..2358bf33c5ef 100644
--- a/arch/s390/configs/performance_defconfig
+++ b/arch/s390/configs/performance_defconfig
@@ -628,6 +628,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
CONFIG_CRYPTO_SHA1_S390=m
CONFIG_CRYPTO_SHA256_S390=m
CONFIG_CRYPTO_SHA512_S390=m
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index d1033de4c4ee..402c530c6da5 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o
obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o sha_common.o
obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o
obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o
-obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
+obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o paes_s390.o
obj-$(CONFIG_S390_PRNG) += prng.o
obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o
diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
new file mode 100644
index 000000000000..d69ea495c4d7
--- /dev/null
+++ b/arch/s390/crypto/paes_s390.c
@@ -0,0 +1,619 @@
+/*
+ * Cryptographic API.
+ *
+ * s390 implementation of the AES Cipher Algorithm with protected keys.
+ *
+ * s390 Version:
+ * Copyright IBM Corp. 2017
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ * Harald Freudenberger <freude@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ */
+
+#define KMSG_COMPONENT "paes_s390"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <crypto/xts.h>
+#include <asm/cpacf.h>
+#include <asm/pkey.h>
+
+static u8 *ctrblk;
+static DEFINE_SPINLOCK(ctrblk_lock);
+
+static cpacf_mask_t km_functions, kmc_functions, kmctr_functions;
+
+struct s390_paes_ctx {
+ struct pkey_seckey sk;
+ struct pkey_protkey pk;
+ unsigned long fc;
+};
+
+struct s390_pxts_ctx {
+ struct pkey_seckey sk[2];
+ struct pkey_protkey pk[2];
+ unsigned long fc;
+};
+
+static inline int __paes_convert_key(struct pkey_seckey *sk,
+ struct pkey_protkey *pk)
+{
+ int i, ret;
+
+ /* try three times in case of failure */
+ for (i = 0; i < 3; i++) {
+ ret = pkey_skey2pkey(sk, pk);
+ if (ret == 0)
+ break;
+ }
+
+ return ret;
+}
+
+static int __paes_set_key(struct s390_paes_ctx *ctx)
+{
+ unsigned long fc;
+
+ if (__paes_convert_key(&ctx->sk, &ctx->pk))
+ return -EINVAL;
+
+ /* Pick the correct function code based on the protected key type */
+ fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 :
+ (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 :
+ (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0;
+
+ /* Check if the function code is available */
+ ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
+
+ return ctx->fc ? 0 : -EINVAL;
+}
+
+static int ecb_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (key_len != SECKEYBLOBSIZE)
+ return -EINVAL;
+
+ memcpy(ctx->sk.seckey, in_key, SECKEYBLOBSIZE);
+ if (__paes_set_key(ctx)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int ecb_paes_crypt(struct blkcipher_desc *desc,
+ unsigned long modifier,
+ struct blkcipher_walk *walk)
+{
+ struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int nbytes, n, k;
+ int ret;
+
+ ret = blkcipher_walk_virt(desc, walk);
+ while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+ /* only use complete blocks */
+ n = nbytes & ~(AES_BLOCK_SIZE - 1);
+ k = cpacf_km(ctx->fc | modifier, ctx->pk.protkey,
+ walk->dst.virt.addr, walk->src.virt.addr, n);
+ if (k)
+ ret = blkcipher_walk_done(desc, walk, nbytes - k);
+ if (k < n) {
+ if (__paes_set_key(ctx) != 0)
+ return blkcipher_walk_done(desc, walk, -EIO);
+ }
+ }
+ return ret;
+}
+
+static int ecb_paes_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_paes_crypt(desc, CPACF_ENCRYPT, &walk);
+}
+
+static int ecb_paes_decrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg ecb_paes_alg = {
+ .cra_name = "ecb(paes)",
+ .cra_driver_name = "ecb-paes-s390",
+ .cra_priority = 400, /* combo: aes + ecb */
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct s390_paes_ctx),
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ecb_paes_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = SECKEYBLOBSIZE,
+ .max_keysize = SECKEYBLOBSIZE,
+ .setkey = ecb_paes_set_key,
+ .encrypt = ecb_paes_encrypt,
+ .decrypt = ecb_paes_decrypt,
+ }
+ }
+};
+
+static int __cbc_paes_set_key(struct s390_paes_ctx *ctx)
+{
+ unsigned long fc;
+
+ if (__paes_convert_key(&ctx->sk, &ctx->pk))
+ return -EINVAL;
+
+ /* Pick the correct function code based on the protected key type */
+ fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMC_PAES_128 :
+ (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMC_PAES_192 :
+ (ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KMC_PAES_256 : 0;
+
+ /* Check if the function code is available */
+ ctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0;
+
+ return ctx->fc ? 0 : -EINVAL;
+}
+
+static int cbc_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ memcpy(ctx->sk.seckey, in_key, SECKEYBLOBSIZE);
+ if (__cbc_paes_set_key(ctx)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int cbc_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier,
+ struct blkcipher_walk *walk)
+{
+ struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int nbytes, n, k;
+ int ret;
+ struct {
+ u8 iv[AES_BLOCK_SIZE];
+ u8 key[MAXPROTKEYSIZE];
+ } param;
+
+ ret = blkcipher_walk_virt(desc, walk);
+ memcpy(param.iv, walk->iv, AES_BLOCK_SIZE);
+ memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+ /* only use complete blocks */
+ n = nbytes & ~(AES_BLOCK_SIZE - 1);
+ k = cpacf_kmc(ctx->fc | modifier, &param,
+ walk->dst.virt.addr, walk->src.virt.addr, n);
+ if (k)
+ ret = blkcipher_walk_done(desc, walk, nbytes - k);
+ if (n < k) {
+ if (__cbc_paes_set_key(ctx) != 0)
+ return blkcipher_walk_done(desc, walk, -EIO);
+ memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ }
+ }
+ memcpy(walk->iv, param.iv, AES_BLOCK_SIZE);
+ return ret;
+}
+
+static int cbc_paes_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return cbc_paes_crypt(desc, 0, &walk);
+}
+
+static int cbc_paes_decrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return cbc_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg cbc_paes_alg = {
+ .cra_name = "cbc(paes)",
+ .cra_driver_name = "cbc-paes-s390",
+ .cra_priority = 400, /* combo: aes + cbc */
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct s390_paes_ctx),
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(cbc_paes_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = SECKEYBLOBSIZE,
+ .max_keysize = SECKEYBLOBSIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = cbc_paes_set_key,
+ .encrypt = cbc_paes_encrypt,
+ .decrypt = cbc_paes_decrypt,
+ }
+ }
+};
+
+static int __xts_paes_set_key(struct s390_pxts_ctx *ctx)
+{
+ unsigned long fc;
+
+ if (__paes_convert_key(&ctx->sk[0], &ctx->pk[0]) ||
+ __paes_convert_key(&ctx->sk[1], &ctx->pk[1]))
+ return -EINVAL;
+
+ if (ctx->pk[0].type != ctx->pk[1].type)
+ return -EINVAL;
+
+ /* Pick the correct function code based on the protected key type */
+ fc = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PXTS_128 :
+ (ctx->pk[0].type == PKEY_KEYTYPE_AES_256) ?
+ CPACF_KM_PXTS_256 : 0;
+
+ /* Check if the function code is available */
+ ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
+
+ return ctx->fc ? 0 : -EINVAL;
+}
+
+static int xts_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct s390_pxts_ctx *ctx = crypto_tfm_ctx(tfm);
+ u8 ckey[2 * AES_MAX_KEY_SIZE];
+ unsigned int ckey_len;
+
+ memcpy(ctx->sk[0].seckey, in_key, SECKEYBLOBSIZE);
+ memcpy(ctx->sk[1].seckey, in_key + SECKEYBLOBSIZE, SECKEYBLOBSIZE);
+ if (__xts_paes_set_key(ctx)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ /*
+ * xts_check_key verifies the key length is not odd and makes
+ * sure that the two keys are not the same. This can be done
+ * on the two protected keys as well
+ */
+ ckey_len = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ?
+ AES_KEYSIZE_128 : AES_KEYSIZE_256;
+ memcpy(ckey, ctx->pk[0].protkey, ckey_len);
+ memcpy(ckey + ckey_len, ctx->pk[1].protkey, ckey_len);
+ return xts_check_key(tfm, ckey, 2*ckey_len);
+}
+
+static int xts_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier,
+ struct blkcipher_walk *walk)
+{
+ struct s390_pxts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int keylen, offset, nbytes, n, k;
+ int ret;
+ struct {
+ u8 key[MAXPROTKEYSIZE]; /* key + verification pattern */
+ u8 tweak[16];
+ u8 block[16];
+ u8 bit[16];
+ u8 xts[16];
+ } pcc_param;
+ struct {
+ u8 key[MAXPROTKEYSIZE]; /* key + verification pattern */
+ u8 init[16];
+ } xts_param;
+
+ ret = blkcipher_walk_virt(desc, walk);
+ keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 48 : 64;
+ offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0;
+retry:
+ memset(&pcc_param, 0, sizeof(pcc_param));
+ memcpy(pcc_param.tweak, walk->iv, sizeof(pcc_param.tweak));
+ memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen);
+ cpacf_pcc(ctx->fc, pcc_param.key + offset);
+
+ memcpy(xts_param.key + offset, ctx->pk[0].protkey, keylen);
+ memcpy(xts_param.init, pcc_param.xts, 16);
+
+ while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+ /* only use complete blocks */
+ n = nbytes & ~(AES_BLOCK_SIZE - 1);
+ k = cpacf_km(ctx->fc | modifier, xts_param.key + offset,
+ walk->dst.virt.addr, walk->src.virt.addr, n);
+ if (k)
+ ret = blkcipher_walk_done(desc, walk, nbytes - k);
+ if (k < n) {
+ if (__xts_paes_set_key(ctx) != 0)
+ return blkcipher_walk_done(desc, walk, -EIO);
+ goto retry;
+ }
+ }
+ return ret;
+}
+
+static int xts_paes_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return xts_paes_crypt(desc, 0, &walk);
+}
+
+static int xts_paes_decrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return xts_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg xts_paes_alg = {
+ .cra_name = "xts(paes)",
+ .cra_driver_name = "xts-paes-s390",
+ .cra_priority = 400, /* combo: aes + xts */
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct s390_pxts_ctx),
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(xts_paes_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = 2 * SECKEYBLOBSIZE,
+ .max_keysize = 2 * SECKEYBLOBSIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = xts_paes_set_key,
+ .encrypt = xts_paes_encrypt,
+ .decrypt = xts_paes_decrypt,
+ }
+ }
+};
+
+static int __ctr_paes_set_key(struct s390_paes_ctx *ctx)
+{
+ unsigned long fc;
+
+ if (__paes_convert_key(&ctx->sk, &ctx->pk))
+ return -EINVAL;
+
+ /* Pick the correct function code based on the protected key type */
+ fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMCTR_PAES_128 :
+ (ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMCTR_PAES_192 :
+ (ctx->pk.type == PKEY_KEYTYPE_AES_256) ?
+ CPACF_KMCTR_PAES_256 : 0;
+
+ /* Check if the function code is available */
+ ctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0;
+
+ return ctx->fc ? 0 : -EINVAL;
+}
+
+static int ctr_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ memcpy(ctx->sk.seckey, in_key, key_len);
+ if (__ctr_paes_set_key(ctx)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes)
+{
+ unsigned int i, n;
+
+ /* only use complete blocks, max. PAGE_SIZE */
+ memcpy(ctrptr, iv, AES_BLOCK_SIZE);
+ n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1);
+ for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) {
+ memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE);
+ crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+ ctrptr += AES_BLOCK_SIZE;
+ }
+ return n;
+}
+
+static int ctr_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier,
+ struct blkcipher_walk *walk)
+{
+ struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ u8 buf[AES_BLOCK_SIZE], *ctrptr;
+ unsigned int nbytes, n, k;
+ int ret, locked;
+
+ locked = spin_trylock(&ctrblk_lock);
+
+ ret = blkcipher_walk_virt_block(desc, walk, AES_BLOCK_SIZE);
+ while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+ n = AES_BLOCK_SIZE;
+ if (nbytes >= 2*AES_BLOCK_SIZE && locked)
+ n = __ctrblk_init(ctrblk, walk->iv, nbytes);
+ ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk->iv;
+ k = cpacf_kmctr(ctx->fc | modifier, ctx->pk.protkey,
+ walk->dst.virt.addr, walk->src.virt.addr,
+ n, ctrptr);
+ if (k) {
+ if (ctrptr == ctrblk)
+ memcpy(walk->iv, ctrptr + k - AES_BLOCK_SIZE,
+ AES_BLOCK_SIZE);
+ crypto_inc(walk->iv, AES_BLOCK_SIZE);
+ ret = blkcipher_walk_done(desc, walk, nbytes - n);
+ }
+ if (k < n) {
+ if (__ctr_paes_set_key(ctx) != 0)
+ return blkcipher_walk_done(desc, walk, -EIO);
+ }
+ }
+ if (locked)
+ spin_unlock(&ctrblk_lock);
+ /*
+ * final block may be < AES_BLOCK_SIZE, copy only nbytes
+ */
+ if (nbytes) {
+ while (1) {
+ if (cpacf_kmctr(ctx->fc | modifier,
+ ctx->pk.protkey, buf,
+ walk->src.virt.addr, AES_BLOCK_SIZE,
+ walk->iv) == AES_BLOCK_SIZE)
+ break;
+ if (__ctr_paes_set_key(ctx) != 0)
+ return blkcipher_walk_done(desc, walk, -EIO);
+ }
+ memcpy(walk->dst.virt.addr, buf, nbytes);
+ crypto_inc(walk->iv, AES_BLOCK_SIZE);
+ ret = blkcipher_walk_done(desc, walk, 0);
+ }
+
+ return ret;
+}
+
+static int ctr_paes_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ctr_paes_crypt(desc, 0, &walk);
+}
+
+static int ctr_paes_decrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ctr_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg ctr_paes_alg = {
+ .cra_name = "ctr(paes)",
+ .cra_driver_name = "ctr-paes-s390",
+ .cra_priority = 400, /* combo: aes + ctr */
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct s390_paes_ctx),
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ctr_paes_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = SECKEYBLOBSIZE,
+ .max_keysize = SECKEYBLOBSIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ctr_paes_set_key,
+ .encrypt = ctr_paes_encrypt,
+ .decrypt = ctr_paes_decrypt,
+ }
+ }
+};
+
+static inline void __crypto_unregister_alg(struct crypto_alg *alg)
+{
+ if (!list_empty(&alg->cra_list))
+ crypto_unregister_alg(alg);
+}
+
+static void paes_s390_fini(void)
+{
+ if (ctrblk)
+ free_page((unsigned long) ctrblk);
+ __crypto_unregister_alg(&ctr_paes_alg);
+ __crypto_unregister_alg(&xts_paes_alg);
+ __crypto_unregister_alg(&cbc_paes_alg);
+ __crypto_unregister_alg(&ecb_paes_alg);
+}
+
+static int __init paes_s390_init(void)
+{
+ int ret;
+
+ /* Query available functions for KM, KMC and KMCTR */
+ cpacf_query(CPACF_KM, &km_functions);
+ cpacf_query(CPACF_KMC, &kmc_functions);
+ cpacf_query(CPACF_KMCTR, &kmctr_functions);
+
+ if (cpacf_test_func(&km_functions, CPACF_KM_PAES_128) ||
+ cpacf_test_func(&km_functions, CPACF_KM_PAES_192) ||
+ cpacf_test_func(&km_functions, CPACF_KM_PAES_256)) {
+ ret = crypto_register_alg(&ecb_paes_alg);
+ if (ret)
+ goto out_err;
+ }
+
+ if (cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_128) ||
+ cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_192) ||
+ cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_256)) {
+ ret = crypto_register_alg(&cbc_paes_alg);
+ if (ret)
+ goto out_err;
+ }
+
+ if (cpacf_test_func(&km_functions, CPACF_KM_PXTS_128) ||
+ cpacf_test_func(&km_functions, CPACF_KM_PXTS_256)) {
+ ret = crypto_register_alg(&xts_paes_alg);
+ if (ret)
+ goto out_err;
+ }
+
+ if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_128) ||
+ cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_192) ||
+ cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) {
+ ret = crypto_register_alg(&ctr_paes_alg);
+ if (ret)
+ goto out_err;
+ ctrblk = (u8 *) __get_free_page(GFP_KERNEL);
+ if (!ctrblk) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ }
+
+ return 0;
+out_err:
+ paes_s390_fini();
+ return ret;
+}
+
+module_init(paes_s390_init);
+module_exit(paes_s390_fini);
+
+MODULE_ALIAS_CRYPTO("aes-all");
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm with protected keys");
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index d00e368fb5e6..68bfd09f1b02 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -229,6 +229,7 @@ CONFIG_CRYPTO_USER_API_HASH=m
CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
CONFIG_CRYPTO_SHA1_S390=m
CONFIG_CRYPTO_SHA256_S390=m
CONFIG_CRYPTO_SHA512_S390=m
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index 2c680db7e5c1..e2dfbf280d12 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -28,8 +28,9 @@
#define CPACF_PPNO 0xb93c /* MSA5 */
/*
- * Decryption modifier bit
+ * En/decryption modifier bits
*/
+#define CPACF_ENCRYPT 0x00
#define CPACF_DECRYPT 0x80
/*
@@ -42,8 +43,13 @@
#define CPACF_KM_AES_128 0x12
#define CPACF_KM_AES_192 0x13
#define CPACF_KM_AES_256 0x14
+#define CPACF_KM_PAES_128 0x1a
+#define CPACF_KM_PAES_192 0x1b
+#define CPACF_KM_PAES_256 0x1c
#define CPACF_KM_XTS_128 0x32
#define CPACF_KM_XTS_256 0x34
+#define CPACF_KM_PXTS_128 0x3a
+#define CPACF_KM_PXTS_256 0x3c
/*
* Function codes for the KMC (CIPHER MESSAGE WITH CHAINING)
@@ -56,6 +62,9 @@
#define CPACF_KMC_AES_128 0x12
#define CPACF_KMC_AES_192 0x13
#define CPACF_KMC_AES_256 0x14
+#define CPACF_KMC_PAES_128 0x1a
+#define CPACF_KMC_PAES_192 0x1b
+#define CPACF_KMC_PAES_256 0x1c
#define CPACF_KMC_PRNG 0x43
/*
@@ -69,6 +78,9 @@
#define CPACF_KMCTR_AES_128 0x12
#define CPACF_KMCTR_AES_192 0x13
#define CPACF_KMCTR_AES_256 0x14
+#define CPACF_KMCTR_PAES_128 0x1a
+#define CPACF_KMCTR_PAES_192 0x1b
+#define CPACF_KMCTR_PAES_256 0x1c
/*
* Function codes for the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST)
@@ -99,6 +111,18 @@
#define CPACF_KMAC_TDEA_192 0x03
/*
+ * Function codes for the PCKMO (PERFORM CRYPTOGRAPHIC KEY MANAGEMENT)
+ * instruction
+ */
+#define CPACF_PCKMO_QUERY 0x00
+#define CPACF_PCKMO_ENC_DES_KEY 0x01
+#define CPACF_PCKMO_ENC_TDES_128_KEY 0x02
+#define CPACF_PCKMO_ENC_TDES_192_KEY 0x03
+#define CPACF_PCKMO_ENC_AES_128_KEY 0x12
+#define CPACF_PCKMO_ENC_AES_192_KEY 0x13
+#define CPACF_PCKMO_ENC_AES_256_KEY 0x14
+
+/*
* Function codes for the PPNO (PERFORM PSEUDORANDOM NUMBER OPERATION)
* instruction
*/
@@ -397,4 +421,24 @@ static inline void cpacf_pcc(unsigned long func, void *param)
: "cc", "memory");
}
+/**
+ * cpacf_pckmo() - executes the PCKMO (PERFORM CRYPTOGRAPHIC KEY
+ * MANAGEMENT) instruction
+ * @func: the function code passed to PCKMO; see CPACF_PCKMO_xxx defines
+ * @param: address of parameter block; see POP for details on each func
+ *
+ * Returns 0.
+ */
+static inline void cpacf_pckmo(long func, void *param)
+{
+ register unsigned long r0 asm("0") = (unsigned long) func;
+ register unsigned long r1 asm("1") = (unsigned long) param;
+
+ asm volatile(
+ " .insn rre,%[opc] << 16,0,0\n" /* PCKMO opcode */
+ :
+ : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCKMO)
+ : "cc", "memory");
+}
+
#endif /* _ASM_S390_CPACF_H */
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 67f7a991c929..9b828c073176 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -63,7 +63,7 @@ static inline void set_user_asce(struct mm_struct *mm)
S390_lowcore.user_asce = mm->context.asce;
if (current->thread.mm_segment.ar4)
__ctl_load(S390_lowcore.user_asce, 7, 7);
- set_cpu_flag(CIF_ASCE);
+ set_cpu_flag(CIF_ASCE_PRIMARY);
}
static inline void clear_user_asce(void)
@@ -81,7 +81,7 @@ static inline void load_kernel_asce(void)
__ctl_store(asce, 1, 1);
if (asce != S390_lowcore.kernel_asce)
__ctl_load(S390_lowcore.kernel_asce, 1, 1);
- set_cpu_flag(CIF_ASCE);
+ set_cpu_flag(CIF_ASCE_PRIMARY);
}
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 52511866fb14..7ed1972b1920 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -640,12 +640,12 @@ static inline int pud_bad(pud_t pud)
static inline int pmd_present(pmd_t pmd)
{
- return pmd_val(pmd) != _SEGMENT_ENTRY_INVALID;
+ return pmd_val(pmd) != _SEGMENT_ENTRY_EMPTY;
}
static inline int pmd_none(pmd_t pmd)
{
- return pmd_val(pmd) == _SEGMENT_ENTRY_INVALID;
+ return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY;
}
static inline unsigned long pmd_pfn(pmd_t pmd)
@@ -803,7 +803,7 @@ static inline void pud_clear(pud_t *pud)
static inline void pmd_clear(pmd_t *pmdp)
{
- pmd_val(*pmdp) = _SEGMENT_ENTRY_INVALID;
+ pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
}
static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -1357,7 +1357,7 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
- return pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
+ return pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
@@ -1367,10 +1367,10 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
{
if (full) {
pmd_t pmd = *pmdp;
- *pmdp = __pmd(_SEGMENT_ENTRY_INVALID);
+ *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
return pmd;
}
- return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
+ return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
#define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
@@ -1384,7 +1384,7 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
static inline void pmdp_invalidate(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
+ pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h
new file mode 100644
index 000000000000..b48aef4188f6
--- /dev/null
+++ b/arch/s390/include/asm/pkey.h
@@ -0,0 +1,90 @@
+/*
+ * Kernelspace interface to the pkey device driver
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author: Harald Freudenberger <freude@de.ibm.com>
+ *
+ */
+
+#ifndef _KAPI_PKEY_H
+#define _KAPI_PKEY_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+#include <uapi/asm/pkey.h>
+
+/*
+ * Generate (AES) random secure key.
+ * @param cardnr may be -1 (use default card)
+ * @param domain may be -1 (use default domain)
+ * @param keytype one of the PKEY_KEYTYPE values
+ * @param seckey pointer to buffer receiving the secure key
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_genseckey(__u16 cardnr, __u16 domain,
+ __u32 keytype, struct pkey_seckey *seckey);
+
+/*
+ * Generate (AES) secure key with given key value.
+ * @param cardnr may be -1 (use default card)
+ * @param domain may be -1 (use default domain)
+ * @param keytype one of the PKEY_KEYTYPE values
+ * @param clrkey pointer to buffer with clear key data
+ * @param seckey pointer to buffer receiving the secure key
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_clr2seckey(__u16 cardnr, __u16 domain, __u32 keytype,
+ const struct pkey_clrkey *clrkey,
+ struct pkey_seckey *seckey);
+
+/*
+ * Derive (AES) proteced key from the (AES) secure key blob.
+ * @param cardnr may be -1 (use default card)
+ * @param domain may be -1 (use default domain)
+ * @param seckey pointer to buffer with the input secure key
+ * @param protkey pointer to buffer receiving the protected key and
+ * additional info (type, length)
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_sec2protkey(__u16 cardnr, __u16 domain,
+ const struct pkey_seckey *seckey,
+ struct pkey_protkey *protkey);
+
+/*
+ * Derive (AES) protected key from a given clear key value.
+ * @param keytype one of the PKEY_KEYTYPE values
+ * @param clrkey pointer to buffer with clear key data
+ * @param protkey pointer to buffer receiving the protected key and
+ * additional info (type, length)
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_clr2protkey(__u32 keytype,
+ const struct pkey_clrkey *clrkey,
+ struct pkey_protkey *protkey);
+
+/*
+ * Search for a matching crypto card based on the Master Key
+ * Verification Pattern provided inside a secure key.
+ * @param seckey pointer to buffer with the input secure key
+ * @param cardnr pointer to cardnr, receives the card number on success
+ * @param domain pointer to domain, receives the domain number on success
+ * @param verify if set, always verify by fetching verification pattern
+ * from card
+ * @return 0 on success, negative errno value on failure. If no card could be
+ * found, -ENODEV is returned.
+ */
+int pkey_findcard(const struct pkey_seckey *seckey,
+ __u16 *cardnr, __u16 *domain, int verify);
+
+/*
+ * Find card and transform secure key to protected key.
+ * @param seckey pointer to buffer with the input secure key
+ * @param protkey pointer to buffer receiving the protected key and
+ * additional info (type, length)
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_skey2pkey(const struct pkey_seckey *seckey,
+ struct pkey_protkey *protkey);
+
+#endif /* _KAPI_PKEY_H */
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index dacba341e475..e4988710aa86 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -14,14 +14,16 @@
#include <linux/const.h>
#define CIF_MCCK_PENDING 0 /* machine check handling is pending */
-#define CIF_ASCE 1 /* user asce needs fixup / uaccess */
-#define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */
-#define CIF_FPU 3 /* restore FPU registers */
-#define CIF_IGNORE_IRQ 4 /* ignore interrupt (for udelay) */
-#define CIF_ENABLED_WAIT 5 /* in enabled wait state */
+#define CIF_ASCE_PRIMARY 1 /* primary asce needs fixup / uaccess */
+#define CIF_ASCE_SECONDARY 2 /* secondary asce needs fixup / uaccess */
+#define CIF_NOHZ_DELAY 3 /* delay HZ disable for a tick */
+#define CIF_FPU 4 /* restore FPU registers */
+#define CIF_IGNORE_IRQ 5 /* ignore interrupt (for udelay) */
+#define CIF_ENABLED_WAIT 6 /* in enabled wait state */
#define _CIF_MCCK_PENDING _BITUL(CIF_MCCK_PENDING)
-#define _CIF_ASCE _BITUL(CIF_ASCE)
+#define _CIF_ASCE_PRIMARY _BITUL(CIF_ASCE_PRIMARY)
+#define _CIF_ASCE_SECONDARY _BITUL(CIF_ASCE_SECONDARY)
#define _CIF_NOHZ_DELAY _BITUL(CIF_NOHZ_DELAY)
#define _CIF_FPU _BITUL(CIF_FPU)
#define _CIF_IGNORE_IRQ _BITUL(CIF_IGNORE_IRQ)
@@ -89,7 +91,8 @@ extern void execve_tail(void);
* User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit.
*/
-#define TASK_SIZE_OF(tsk) ((tsk)->mm->context.asce_limit)
+#define TASK_SIZE_OF(tsk) ((tsk)->mm ? \
+ (tsk)->mm->context.asce_limit : TASK_MAX_SIZE)
#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \
(1UL << 30) : (1UL << 41))
#define TASK_SIZE TASK_SIZE_OF(current)
@@ -200,10 +203,12 @@ struct stack_frame {
struct task_struct;
struct mm_struct;
struct seq_file;
+struct pt_regs;
typedef int (*dump_trace_func_t)(void *data, unsigned long address, int reliable);
void dump_trace(dump_trace_func_t func, void *data,
struct task_struct *task, unsigned long sp);
+void show_registers(struct pt_regs *regs);
void show_cacheinfo(struct seq_file *m);
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index b2988fc60f65..136932ff4250 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -14,6 +14,7 @@
*/
#include <linux/sched.h>
#include <linux/errno.h>
+#include <asm/processor.h>
#include <asm/ctl_reg.h>
#define VERIFY_READ 0
@@ -36,18 +37,20 @@
#define get_ds() (KERNEL_DS)
#define get_fs() (current->thread.mm_segment)
-
-#define set_fs(x) \
-do { \
- unsigned long __pto; \
- current->thread.mm_segment = (x); \
- __pto = current->thread.mm_segment.ar4 ? \
- S390_lowcore.user_asce : S390_lowcore.kernel_asce; \
- __ctl_load(__pto, 7, 7); \
-} while (0)
-
#define segment_eq(a,b) ((a).ar4 == (b).ar4)
+static inline void set_fs(mm_segment_t fs)
+{
+ current->thread.mm_segment = fs;
+ if (segment_eq(fs, KERNEL_DS)) {
+ set_cpu_flag(CIF_ASCE_SECONDARY);
+ __ctl_load(S390_lowcore.kernel_asce, 7, 7);
+ } else {
+ clear_cpu_flag(CIF_ASCE_SECONDARY);
+ __ctl_load(S390_lowcore.user_asce, 7, 7);
+ }
+}
+
static inline int __range_ok(unsigned long addr, unsigned long size)
{
return 1;
diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild
index bf736e764cb4..6848ba5c1454 100644
--- a/arch/s390/include/uapi/asm/Kbuild
+++ b/arch/s390/include/uapi/asm/Kbuild
@@ -24,6 +24,7 @@ header-y += mman.h
header-y += monwriter.h
header-y += msgbuf.h
header-y += param.h
+header-y += pkey.h
header-y += poll.h
header-y += posix_types.h
header-y += ptrace.h
diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h
new file mode 100644
index 000000000000..ed7f19c27ce5
--- /dev/null
+++ b/arch/s390/include/uapi/asm/pkey.h
@@ -0,0 +1,112 @@
+/*
+ * Userspace interface to the pkey device driver
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author: Harald Freudenberger <freude@de.ibm.com>
+ *
+ */
+
+#ifndef _UAPI_PKEY_H
+#define _UAPI_PKEY_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/*
+ * Ioctl calls supported by the pkey device driver
+ */
+
+#define PKEY_IOCTL_MAGIC 'p'
+
+#define SECKEYBLOBSIZE 64 /* secure key blob size is always 64 bytes */
+#define MAXPROTKEYSIZE 64 /* a protected key blob may be up to 64 bytes */
+#define MAXCLRKEYSIZE 32 /* a clear key value may be up to 32 bytes */
+
+/* defines for the type field within the pkey_protkey struct */
+#define PKEY_KEYTYPE_AES_128 1
+#define PKEY_KEYTYPE_AES_192 2
+#define PKEY_KEYTYPE_AES_256 3
+
+/* Struct to hold a secure key blob */
+struct pkey_seckey {
+ __u8 seckey[SECKEYBLOBSIZE]; /* the secure key blob */
+};
+
+/* Struct to hold protected key and length info */
+struct pkey_protkey {
+ __u32 type; /* key type, one of the PKEY_KEYTYPE values */
+ __u32 len; /* bytes actually stored in protkey[] */
+ __u8 protkey[MAXPROTKEYSIZE]; /* the protected key blob */
+};
+
+/* Struct to hold a clear key value */
+struct pkey_clrkey {
+ __u8 clrkey[MAXCLRKEYSIZE]; /* 16, 24, or 32 byte clear key value */
+};
+
+/*
+ * Generate secure key
+ */
+struct pkey_genseck {
+ __u16 cardnr; /* in: card to use or FFFF for any */
+ __u16 domain; /* in: domain or FFFF for any */
+ __u32 keytype; /* in: key type to generate */
+ struct pkey_seckey seckey; /* out: the secure key blob */
+};
+#define PKEY_GENSECK _IOWR(PKEY_IOCTL_MAGIC, 0x01, struct pkey_genseck)
+
+/*
+ * Construct secure key from clear key value
+ */
+struct pkey_clr2seck {
+ __u16 cardnr; /* in: card to use or FFFF for any */
+ __u16 domain; /* in: domain or FFFF for any */
+ __u32 keytype; /* in: key type to generate */
+ struct pkey_clrkey clrkey; /* in: the clear key value */
+ struct pkey_seckey seckey; /* out: the secure key blob */
+};
+#define PKEY_CLR2SECK _IOWR(PKEY_IOCTL_MAGIC, 0x02, struct pkey_clr2seck)
+
+/*
+ * Fabricate protected key from a secure key
+ */
+struct pkey_sec2protk {
+ __u16 cardnr; /* in: card to use or FFFF for any */
+ __u16 domain; /* in: domain or FFFF for any */
+ struct pkey_seckey seckey; /* in: the secure key blob */
+ struct pkey_protkey protkey; /* out: the protected key */
+};
+#define PKEY_SEC2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x03, struct pkey_sec2protk)
+
+/*
+ * Fabricate protected key from an clear key value
+ */
+struct pkey_clr2protk {
+ __u32 keytype; /* in: key type to generate */
+ struct pkey_clrkey clrkey; /* in: the clear key value */
+ struct pkey_protkey protkey; /* out: the protected key */
+};
+#define PKEY_CLR2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x04, struct pkey_clr2protk)
+
+/*
+ * Search for matching crypto card based on the Master Key
+ * Verification Pattern provided inside a secure key.
+ */
+struct pkey_findcard {
+ struct pkey_seckey seckey; /* in: the secure key blob */
+ __u16 cardnr; /* out: card number */
+ __u16 domain; /* out: domain number */
+};
+#define PKEY_FINDCARD _IOWR(PKEY_IOCTL_MAGIC, 0x05, struct pkey_findcard)
+
+/*
+ * Combined together: findcard + sec2prot
+ */
+struct pkey_skey2pkey {
+ struct pkey_seckey seckey; /* in: the secure key blob */
+ struct pkey_protkey protkey; /* out: the protected key */
+};
+#define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey)
+
+#endif /* _UAPI_PKEY_H */
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index db469fa11462..dff2152350a7 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -50,7 +50,8 @@ _TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
_TIF_UPROBE)
_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
_TIF_SYSCALL_TRACEPOINT)
-_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE | _CIF_FPU)
+_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \
+ _CIF_ASCE_SECONDARY | _CIF_FPU)
_PIF_WORK = (_PIF_PER_TRAP)
#define BASED(name) name-cleanup_critical(%r13)
@@ -339,8 +340,8 @@ ENTRY(system_call)
jo .Lsysc_notify_resume
TSTMSK __LC_CPU_FLAGS,_CIF_FPU
jo .Lsysc_vxrs
- TSTMSK __LC_CPU_FLAGS,_CIF_ASCE
- jo .Lsysc_uaccess
+ TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
+ jnz .Lsysc_asce
j .Lsysc_return # beware of critical section cleanup
#
@@ -358,12 +359,15 @@ ENTRY(system_call)
jg s390_handle_mcck # TIF bit will be cleared by handler
#
-# _CIF_ASCE is set, load user space asce
+# _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce
#
-.Lsysc_uaccess:
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE
+.Lsysc_asce:
+ ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- j .Lsysc_return
+ TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_SECONDARY
+ jz .Lsysc_return
+ larl %r14,.Lsysc_return
+ jg set_fs_fixup
#
# CIF_FPU is set, restore floating-point controls and floating-point registers.
@@ -661,8 +665,8 @@ ENTRY(io_int_handler)
jo .Lio_notify_resume
TSTMSK __LC_CPU_FLAGS,_CIF_FPU
jo .Lio_vxrs
- TSTMSK __LC_CPU_FLAGS,_CIF_ASCE
- jo .Lio_uaccess
+ TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
+ jnz .Lio_asce
j .Lio_return # beware of critical section cleanup
#
@@ -675,12 +679,15 @@ ENTRY(io_int_handler)
j .Lio_return
#
-# _CIF_ASCE is set, load user space asce
+# _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce
#
-.Lio_uaccess:
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE
+.Lio_asce:
+ ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- j .Lio_return
+ TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_SECONDARY
+ jz .Lio_return
+ larl %r14,.Lio_return
+ jg set_fs_fixup
#
# CIF_FPU is set, restore floating-point controls and floating-point registers.
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index e79f030dd276..33f901865326 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -80,5 +80,6 @@ long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t);
DECLARE_PER_CPU(u64, mt_cycles[8]);
void verify_facilities(void);
+void set_fs_fixup(void);
#endif /* _ENTRY_H */
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 56e14d073167..80c093e0c6f1 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -116,6 +116,19 @@ static int notrace s390_validate_registers(union mci mci, int umode)
s390_handle_damage();
kill_task = 1;
}
+ /* Validate control registers */
+ if (!mci.cr) {
+ /*
+ * Control registers have unknown contents.
+ * Can't recover and therefore stopping machine.
+ */
+ s390_handle_damage();
+ } else {
+ asm volatile(
+ " lctlg 0,15,0(%0)\n"
+ " ptlb\n"
+ : : "a" (&S390_lowcore.cregs_save_area) : "memory");
+ }
if (!mci.fp) {
/*
* Floating point registers can't be restored. If the
@@ -208,18 +221,6 @@ static int notrace s390_validate_registers(union mci mci, int umode)
*/
kill_task = 1;
}
- /* Validate control registers */
- if (!mci.cr) {
- /*
- * Control registers have unknown contents.
- * Can't recover and therefore stopping machine.
- */
- s390_handle_damage();
- } else {
- asm volatile(
- " lctlg 0,15,0(%0)"
- : : "a" (&S390_lowcore.cregs_save_area) : "memory");
- }
/*
* We don't even try to validate the TOD register, since we simply
* can't write something sensible into that register.
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index c5b86b4a1a8b..54281660582c 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -100,8 +100,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
return 0;
}
-int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
- unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
{
struct fake_frame
{
@@ -156,7 +156,6 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
/* Set a new TLS ? */
if (clone_flags & CLONE_SETTLS) {
- unsigned long tls = frame->childregs.gprs[6];
if (is_compat_task()) {
p->thread.acrs[0] = (unsigned int)tls;
} else {
@@ -234,3 +233,16 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
ret = PAGE_ALIGN(mm->brk + brk_rnd());
return (ret > mm->brk) ? ret : mm->brk;
}
+
+void set_fs_fixup(void)
+{
+ struct pt_regs *regs = current_pt_regs();
+ static bool warned;
+
+ set_fs(USER_DS);
+ if (warned)
+ return;
+ WARN(1, "Unbalanced set_fs - int code: 0x%x\n", regs->int_code);
+ show_registers(regs);
+ warned = true;
+}
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 59ac93714fa4..a07b1ec1391d 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -359,8 +359,8 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
spin_lock(&gmap->guest_table_lock);
entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
if (entry) {
- flush = (*entry != _SEGMENT_ENTRY_INVALID);
- *entry = _SEGMENT_ENTRY_INVALID;
+ flush = (*entry != _SEGMENT_ENTRY_EMPTY);
+ *entry = _SEGMENT_ENTRY_EMPTY;
}
spin_unlock(&gmap->guest_table_lock);
return flush;
@@ -589,7 +589,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
return rc;
ptl = pmd_lock(mm, pmd);
spin_lock(&gmap->guest_table_lock);
- if (*table == _SEGMENT_ENTRY_INVALID) {
+ if (*table == _SEGMENT_ENTRY_EMPTY) {
rc = radix_tree_insert(&gmap->host_to_guest,
vmaddr >> PMD_SHIFT, table);
if (!rc)
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index a03816227719..9b4050caa4e9 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -62,7 +62,7 @@ static inline unsigned long __pte_to_rste(pte_t pte)
rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC,
_SEGMENT_ENTRY_NOEXEC);
} else
- rste = _SEGMENT_ENTRY_INVALID;
+ rste = _SEGMENT_ENTRY_EMPTY;
return rste;
}
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 2cac445b02fd..0b49dbc423e2 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -62,19 +62,32 @@ config CRYPTO_DEV_GEODE
will be called geode-aes.
config ZCRYPT
- tristate "Support for PCI-attached cryptographic adapters"
+ tristate "Support for s390 cryptographic adapters"
depends on S390
select HW_RANDOM
help
- Select this option if you want to use a PCI-attached cryptographic
- adapter like:
- + PCI Cryptographic Accelerator (PCICA)
- + PCI Cryptographic Coprocessor (PCICC)
+ Select this option if you want to enable support for
+ s390 cryptographic adapters like:
+ PCI-X Cryptographic Coprocessor (PCIXCC)
- + Crypto Express2 Coprocessor (CEX2C)
- + Crypto Express2 Accelerator (CEX2A)
- + Crypto Express3 Coprocessor (CEX3C)
- + Crypto Express3 Accelerator (CEX3A)
+ + Crypto Express 2,3,4 or 5 Coprocessor (CEXxC)
+ + Crypto Express 2,3,4 or 5 Accelerator (CEXxA)
+ + Crypto Express 4 or 5 EP11 Coprocessor (CEXxP)
+
+config PKEY
+ tristate "Kernel API for protected key handling"
+ depends on S390
+ depends on ZCRYPT
+ help
+ With this option enabled the pkey kernel module provides an API
+ for creation and handling of protected keys. Other parts of the
+ kernel or userspace applications may use these functions.
+
+ Select this option if you want to enable the kernel and userspace
+ API for proteced key handling.
+
+ Please note that creation of protected keys from secure keys
+ requires to have at least one CEX card in coprocessor mode
+ available at runtime.
config CRYPTO_SHA1_S390
tristate "SHA1 digest algorithm"
@@ -124,6 +137,7 @@ config CRYPTO_AES_S390
depends on S390
select CRYPTO_ALGAPI
select CRYPTO_BLKCIPHER
+ select PKEY
help
This is the s390 hardware accelerated implementation of the
AES cipher algorithms (FIPS-197).
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index edaae9f9853c..e426ac877d19 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -13,6 +13,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
multicast.o mad.o smi.o agent.o mad_rmpp.o
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
+ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
ib_cm-y := cm.o
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
new file mode 100644
index 000000000000..126ac5f99db7
--- /dev/null
+++ b/drivers/infiniband/core/cgroup.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include "core_priv.h"
+
+/**
+ * ib_device_register_rdmacg - register with rdma cgroup.
+ * @device: device to register to participate in resource
+ * accounting by rdma cgroup.
+ *
+ * Register with the rdma cgroup. Should be called before
+ * exposing rdma device to user space applications to avoid
+ * resource accounting leak.
+ * Returns 0 on success or otherwise failure code.
+ */
+int ib_device_register_rdmacg(struct ib_device *device)
+{
+ device->cg_device.name = device->name;
+ return rdmacg_register_device(&device->cg_device);
+}
+
+/**
+ * ib_device_unregister_rdmacg - unregister with rdma cgroup.
+ * @device: device to unregister.
+ *
+ * Unregister with the rdma cgroup. Should be called after
+ * all the resources are deallocated, and after a stage when any
+ * other resource allocation by user application cannot be done
+ * for this device to avoid any leak in accounting.
+ */
+void ib_device_unregister_rdmacg(struct ib_device *device)
+{
+ rdmacg_unregister_device(&device->cg_device);
+}
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{
+ return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
+ resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_try_charge);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{
+ rdmacg_uncharge(cg_obj->cg, &device->cg_device,
+ resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_uncharge);
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 912ab4cd6eae..cb7d372e4bdf 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -35,6 +35,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
+#include <linux/cgroup_rdma.h>
#include <rdma/ib_verbs.h>
@@ -124,6 +125,35 @@ int ib_cache_setup_one(struct ib_device *device);
void ib_cache_cleanup_one(struct ib_device *device);
void ib_cache_release_one(struct ib_device *device);
+#ifdef CONFIG_CGROUP_RDMA
+int ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_unregister_rdmacg(struct ib_device *device);
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index);
+#else
+static inline int ib_device_register_rdmacg(struct ib_device *device)
+{ return 0; }
+
+static inline void ib_device_unregister_rdmacg(struct ib_device *device)
+{ }
+
+static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{ return 0; }
+
+static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{ }
+#endif
+
static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
struct net_device *upper)
{
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index a63e8400ea3b..593d2ce6ec7c 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -369,10 +369,18 @@ int ib_register_device(struct ib_device *device,
goto out;
}
+ ret = ib_device_register_rdmacg(device);
+ if (ret) {
+ pr_warn("Couldn't register device with rdma cgroup\n");
+ ib_cache_cleanup_one(device);
+ goto out;
+ }
+
memset(&device->attrs, 0, sizeof(device->attrs));
ret = device->query_device(device, &device->attrs, &uhw);
if (ret) {
pr_warn("Couldn't query the device attributes\n");
+ ib_device_unregister_rdmacg(device);
ib_cache_cleanup_one(device);
goto out;
}
@@ -381,6 +389,7 @@ int ib_register_device(struct ib_device *device,
if (ret) {
pr_warn("Couldn't register device %s with driver model\n",
device->name);
+ ib_device_unregister_rdmacg(device);
ib_cache_cleanup_one(device);
goto out;
}
@@ -430,6 +439,7 @@ void ib_unregister_device(struct ib_device *device)
mutex_unlock(&device_mutex);
+ ib_device_unregister_rdmacg(device);
ib_device_unregister_sysfs(device);
ib_cache_cleanup_one(device);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index b4b395a054ac..7b7a76e1279a 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -316,6 +316,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
struct ib_udata udata;
struct ib_ucontext *ucontext;
struct file *filp;
+ struct ib_rdmacg_object cg_obj;
int ret;
if (out_len < sizeof resp)
@@ -335,13 +336,18 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
+ ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+ if (ret)
+ goto err;
+
ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
if (IS_ERR(ucontext)) {
ret = PTR_ERR(ucontext);
- goto err;
+ goto err_alloc;
}
ucontext->device = ib_dev;
+ ucontext->cg_obj = cg_obj;
INIT_LIST_HEAD(&ucontext->pd_list);
INIT_LIST_HEAD(&ucontext->mr_list);
INIT_LIST_HEAD(&ucontext->mw_list);
@@ -407,6 +413,9 @@ err_free:
put_pid(ucontext->tgid);
ib_dev->dealloc_ucontext(ucontext);
+err_alloc:
+ ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+
err:
mutex_unlock(&file->mutex);
return ret;
@@ -561,6 +570,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
return -ENOMEM;
init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
+ ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+ RDMACG_RESOURCE_HCA_OBJECT);
+ if (ret) {
+ kfree(uobj);
+ return ret;
+ }
+
down_write(&uobj->mutex);
pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
@@ -605,6 +621,7 @@ err_idr:
ib_dealloc_pd(pd);
err:
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
put_uobj_write(uobj);
return ret;
}
@@ -637,6 +654,8 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
if (ret)
goto err_put;
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
uobj->live = 0;
put_uobj_write(uobj);
@@ -1006,6 +1025,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
goto err_put;
}
}
+ ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+ RDMACG_RESOURCE_HCA_OBJECT);
+ if (ret)
+ goto err_charge;
mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
cmd.access_flags, &udata);
@@ -1054,6 +1077,9 @@ err_unreg:
ib_dereg_mr(mr);
err_put:
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
put_pd_read(pd);
err_free:
@@ -1178,6 +1204,8 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
if (ret)
return ret;
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
mutex_lock(&file->mutex);
@@ -1226,6 +1254,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
out_len - sizeof(resp));
+ ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+ RDMACG_RESOURCE_HCA_OBJECT);
+ if (ret)
+ goto err_charge;
+
mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
if (IS_ERR(mw)) {
ret = PTR_ERR(mw);
@@ -1271,6 +1304,9 @@ err_unalloc:
uverbs_dealloc_mw(mw);
err_put:
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
put_pd_read(pd);
err_free:
@@ -1306,6 +1342,8 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
if (ret)
return ret;
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
mutex_lock(&file->mutex);
@@ -1405,6 +1443,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
attr.flags = cmd->flags;
+ ret = ib_rdmacg_try_charge(&obj->uobject.cg_obj, ib_dev,
+ RDMACG_RESOURCE_HCA_OBJECT);
+ if (ret)
+ goto err_charge;
+
cq = ib_dev->create_cq(ib_dev, &attr,
file->ucontext, uhw);
if (IS_ERR(cq)) {
@@ -1452,6 +1495,10 @@ err_free:
ib_destroy_cq(cq);
err_file:
+ ib_rdmacg_uncharge(&obj->uobject.cg_obj, ib_dev,
+ RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
if (ev_file)
ib_uverbs_release_ucq(file, ev_file, obj);
@@ -1732,6 +1779,8 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
if (ret)
return ret;
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
mutex_lock(&file->mutex);
@@ -1905,6 +1954,11 @@ static int create_qp(struct ib_uverbs_file *file,
goto err_put;
}
+ ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, device,
+ RDMACG_RESOURCE_HCA_OBJECT);
+ if (ret)
+ goto err_put;
+
if (cmd->qp_type == IB_QPT_XRC_TGT)
qp = ib_create_qp(pd, &attr);
else
@@ -1912,7 +1966,7 @@ static int create_qp(struct ib_uverbs_file *file,
if (IS_ERR(qp)) {
ret = PTR_ERR(qp);
- goto err_put;
+ goto err_create;
}
if (cmd->qp_type != IB_QPT_XRC_TGT) {
@@ -1993,6 +2047,10 @@ err_cb:
err_destroy:
ib_destroy_qp(qp);
+err_create:
+ ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, device,
+ RDMACG_RESOURCE_HCA_OBJECT);
+
err_put:
if (xrcd)
put_xrcd_read(xrcd_uobj);
@@ -2519,6 +2577,8 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
if (ret)
return ret;
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
if (obj->uxrcd)
atomic_dec(&obj->uxrcd->refcnt);
@@ -2970,11 +3030,16 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
memset(&attr.dmac, 0, sizeof(attr.dmac));
memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
+ ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+ RDMACG_RESOURCE_HCA_OBJECT);
+ if (ret)
+ goto err_charge;
+
ah = pd->device->create_ah(pd, &attr, &udata);
if (IS_ERR(ah)) {
ret = PTR_ERR(ah);
- goto err_put;
+ goto err_create;
}
ah->device = pd->device;
@@ -3013,7 +3078,10 @@ err_copy:
err_destroy:
ib_destroy_ah(ah);
-err_put:
+err_create:
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
put_pd_read(pd);
err:
@@ -3047,6 +3115,8 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
if (ret)
return ret;
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
mutex_lock(&file->mutex);
@@ -3861,10 +3931,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
err = -EINVAL;
goto err_free;
}
+
+ err = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+ RDMACG_RESOURCE_HCA_OBJECT);
+ if (err)
+ goto err_free;
+
flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
if (IS_ERR(flow_id)) {
err = PTR_ERR(flow_id);
- goto err_free;
+ goto err_create;
}
flow_id->uobject = uobj;
uobj->object = flow_id;
@@ -3897,6 +3973,8 @@ err_copy:
idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
destroy_flow:
ib_destroy_flow(flow_id);
+err_create:
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
err_free:
kfree(flow_attr);
err_put:
@@ -3936,8 +4014,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
flow_id = uobj->object;
ret = ib_destroy_flow(flow_id);
- if (!ret)
+ if (!ret) {
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev,
+ RDMACG_RESOURCE_HCA_OBJECT);
uobj->live = 0;
+ }
put_uobj_write(uobj);
@@ -4005,6 +4086,11 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
obj->uevent.events_reported = 0;
INIT_LIST_HEAD(&obj->uevent.event_list);
+ ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, ib_dev,
+ RDMACG_RESOURCE_HCA_OBJECT);
+ if (ret)
+ goto err_put_cq;
+
srq = pd->device->create_srq(pd, &attr, udata);
if (IS_ERR(srq)) {
ret = PTR_ERR(srq);
@@ -4069,6 +4155,8 @@ err_destroy:
ib_destroy_srq(srq);
err_put:
+ ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, ib_dev,
+ RDMACG_RESOURCE_HCA_OBJECT);
put_pd_read(pd);
err_put_cq:
@@ -4255,6 +4343,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
if (ret)
return ret;
+ ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
if (srq_type == IB_SRQT_XRC) {
us = container_of(obj, struct ib_usrq_object, uevent);
atomic_dec(&us->uxrcd->refcnt);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index e3fb4b1af1ad..35c788a32e26 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -51,6 +51,7 @@
#include <rdma/ib.h>
#include "uverbs.h"
+#include "core_priv.h"
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("InfiniBand userspace verbs access");
@@ -237,6 +238,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
ib_destroy_ah(ah);
+ ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
kfree(uobj);
}
@@ -246,6 +249,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
uverbs_dealloc_mw(mw);
+ ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
kfree(uobj);
}
@@ -254,6 +259,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
ib_destroy_flow(flow_id);
+ ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
kfree(uobj);
}
@@ -266,6 +273,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
if (qp == qp->real_qp)
ib_uverbs_detach_umcast(qp, uqp);
ib_destroy_qp(qp);
+ ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
ib_uverbs_release_uevent(file, &uqp->uevent);
kfree(uqp);
}
@@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
ib_destroy_srq(srq);
+ ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
ib_uverbs_release_uevent(file, uevent);
kfree(uevent);
}
@@ -310,6 +321,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
ib_destroy_cq(cq);
+ ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
ib_uverbs_release_ucq(file, ev_file, ucq);
kfree(ucq);
}
@@ -319,6 +332,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
ib_dereg_mr(mr);
+ ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
kfree(uobj);
}
@@ -339,11 +354,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
ib_dealloc_pd(pd);
+ ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
kfree(uobj);
}
put_pid(context->tgid);
+ ib_rdmacg_uncharge(&context->cg_obj, context->device,
+ RDMACG_RESOURCE_HCA_HANDLE);
+
return context->device->dealloc_ucontext(context);
}
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 5dc673dc9487..ee1b0e9dde79 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -1434,9 +1434,10 @@ config RTC_DRV_SUN4V
based RTC on SUN4V systems.
config RTC_DRV_SUN6I
- tristate "Allwinner A31 RTC"
- default MACH_SUN6I || MACH_SUN8I || COMPILE_TEST
- depends on ARCH_SUNXI
+ bool "Allwinner A31 RTC"
+ default MACH_SUN6I || MACH_SUN8I
+ depends on COMMON_CLK
+ depends on ARCH_SUNXI || COMPILE_TEST
help
If you say Y here you will get support for the RTC found in
some Allwinner SoCs like the A31 or the A64.
@@ -1719,6 +1720,17 @@ config RTC_DRV_R7301
This driver can also be built as a module. If so, the module
will be called rtc-r7301.
+config RTC_DRV_STM32
+ tristate "STM32 RTC"
+ select REGMAP_MMIO
+ depends on ARCH_STM32 || COMPILE_TEST
+ help
+ If you say yes here you get support for the STM32 On-Chip
+ Real Time Clock.
+
+ This driver can also be built as a module, if so, the module
+ will be called "rtc-stm32".
+
comment "HID Sensor RTC drivers"
config RTC_DRV_HID_SENSOR_TIME
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index f13ab1c5c222..f07297b1460a 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -145,6 +145,7 @@ obj-$(CONFIG_RTC_DRV_SNVS) += rtc-snvs.o
obj-$(CONFIG_RTC_DRV_SPEAR) += rtc-spear.o
obj-$(CONFIG_RTC_DRV_STARFIRE) += rtc-starfire.o
obj-$(CONFIG_RTC_DRV_STK17TA8) += rtc-stk17ta8.o
+obj-$(CONFIG_RTC_DRV_STM32) += rtc-stm32.o
obj-$(CONFIG_RTC_DRV_STMP) += rtc-stmp3xxx.o
obj-$(CONFIG_RTC_DRV_ST_LPC) += rtc-st-lpc.o
obj-$(CONFIG_RTC_DRV_SUN4V) += rtc-sun4v.o
diff --git a/drivers/rtc/rtc-armada38x.c b/drivers/rtc/rtc-armada38x.c
index 9a3f2a6f512e..21f355c37eab 100644
--- a/drivers/rtc/rtc-armada38x.c
+++ b/drivers/rtc/rtc-armada38x.c
@@ -16,6 +16,7 @@
#include <linux/io.h>
#include <linux/module.h>
#include <linux/of.h>
+#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/rtc.h>
@@ -23,17 +24,48 @@
#define RTC_STATUS_ALARM1 BIT(0)
#define RTC_STATUS_ALARM2 BIT(1)
#define RTC_IRQ1_CONF 0x4
-#define RTC_IRQ1_AL_EN BIT(0)
-#define RTC_IRQ1_FREQ_EN BIT(1)
-#define RTC_IRQ1_FREQ_1HZ BIT(2)
+#define RTC_IRQ2_CONF 0x8
+#define RTC_IRQ_AL_EN BIT(0)
+#define RTC_IRQ_FREQ_EN BIT(1)
+#define RTC_IRQ_FREQ_1HZ BIT(2)
+
#define RTC_TIME 0xC
#define RTC_ALARM1 0x10
-
-#define SOC_RTC_INTERRUPT 0x8
-#define SOC_RTC_ALARM1 BIT(0)
-#define SOC_RTC_ALARM2 BIT(1)
-#define SOC_RTC_ALARM1_MASK BIT(2)
-#define SOC_RTC_ALARM2_MASK BIT(3)
+#define RTC_ALARM2 0x14
+
+/* Armada38x SoC registers */
+#define RTC_38X_BRIDGE_TIMING_CTL 0x0
+#define RTC_38X_PERIOD_OFFS 0
+#define RTC_38X_PERIOD_MASK (0x3FF << RTC_38X_PERIOD_OFFS)
+#define RTC_38X_READ_DELAY_OFFS 26
+#define RTC_38X_READ_DELAY_MASK (0x1F << RTC_38X_READ_DELAY_OFFS)
+
+/* Armada 7K/8K registers */
+#define RTC_8K_BRIDGE_TIMING_CTL0 0x0
+#define RTC_8K_WRCLK_PERIOD_OFFS 0
+#define RTC_8K_WRCLK_PERIOD_MASK (0xFFFF << RTC_8K_WRCLK_PERIOD_OFFS)
+#define RTC_8K_WRCLK_SETUP_OFFS 16
+#define RTC_8K_WRCLK_SETUP_MASK (0xFFFF << RTC_8K_WRCLK_SETUP_OFFS)
+#define RTC_8K_BRIDGE_TIMING_CTL1 0x4
+#define RTC_8K_READ_DELAY_OFFS 0
+#define RTC_8K_READ_DELAY_MASK (0xFFFF << RTC_8K_READ_DELAY_OFFS)
+
+#define RTC_8K_ISR 0x10
+#define RTC_8K_IMR 0x14
+#define RTC_8K_ALARM2 BIT(0)
+
+#define SOC_RTC_INTERRUPT 0x8
+#define SOC_RTC_ALARM1 BIT(0)
+#define SOC_RTC_ALARM2 BIT(1)
+#define SOC_RTC_ALARM1_MASK BIT(2)
+#define SOC_RTC_ALARM2_MASK BIT(3)
+
+#define SAMPLE_NR 100
+
+struct value_to_freq {
+ u32 value;
+ u8 freq;
+};
struct armada38x_rtc {
struct rtc_device *rtc_dev;
@@ -41,38 +73,153 @@ struct armada38x_rtc {
void __iomem *regs_soc;
spinlock_t lock;
int irq;
+ struct value_to_freq *val_to_freq;
+ struct armada38x_rtc_data *data;
+};
+
+#define ALARM1 0
+#define ALARM2 1
+
+#define ALARM_REG(base, alarm) ((base) + (alarm) * sizeof(u32))
+
+struct armada38x_rtc_data {
+ /* Initialize the RTC-MBUS bridge timing */
+ void (*update_mbus_timing)(struct armada38x_rtc *rtc);
+ u32 (*read_rtc_reg)(struct armada38x_rtc *rtc, u8 rtc_reg);
+ void (*clear_isr)(struct armada38x_rtc *rtc);
+ void (*unmask_interrupt)(struct armada38x_rtc *rtc);
+ u32 alarm;
};
/*
* According to the datasheet, the OS should wait 5us after every
* register write to the RTC hard macro so that the required update
* can occur without holding off the system bus
+ * According to errata RES-3124064, Write to any RTC register
+ * may fail. As a workaround, before writing to RTC
+ * register, issue a dummy write of 0x0 twice to RTC Status
+ * register.
*/
+
static void rtc_delayed_write(u32 val, struct armada38x_rtc *rtc, int offset)
{
+ writel(0, rtc->regs + RTC_STATUS);
+ writel(0, rtc->regs + RTC_STATUS);
writel(val, rtc->regs + offset);
udelay(5);
}
+/* Update RTC-MBUS bridge timing parameters */
+static void rtc_update_38x_mbus_timing_params(struct armada38x_rtc *rtc)
+{
+ u32 reg;
+
+ reg = readl(rtc->regs_soc + RTC_38X_BRIDGE_TIMING_CTL);
+ reg &= ~RTC_38X_PERIOD_MASK;
+ reg |= 0x3FF << RTC_38X_PERIOD_OFFS; /* Maximum value */
+ reg &= ~RTC_38X_READ_DELAY_MASK;
+ reg |= 0x1F << RTC_38X_READ_DELAY_OFFS; /* Maximum value */
+ writel(reg, rtc->regs_soc + RTC_38X_BRIDGE_TIMING_CTL);
+}
+
+static void rtc_update_8k_mbus_timing_params(struct armada38x_rtc *rtc)
+{
+ u32 reg;
+
+ reg = readl(rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL0);
+ reg &= ~RTC_8K_WRCLK_PERIOD_MASK;
+ reg |= 0x3FF << RTC_8K_WRCLK_PERIOD_OFFS;
+ reg &= ~RTC_8K_WRCLK_SETUP_MASK;
+ reg |= 0x29 << RTC_8K_WRCLK_SETUP_OFFS;
+ writel(reg, rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL0);
+
+ reg = readl(rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL1);
+ reg &= ~RTC_8K_READ_DELAY_MASK;
+ reg |= 0x3F << RTC_8K_READ_DELAY_OFFS;
+ writel(reg, rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL1);
+}
+
+static u32 read_rtc_register(struct armada38x_rtc *rtc, u8 rtc_reg)
+{
+ return readl(rtc->regs + rtc_reg);
+}
+
+static u32 read_rtc_register_38x_wa(struct armada38x_rtc *rtc, u8 rtc_reg)
+{
+ int i, index_max = 0, max = 0;
+
+ for (i = 0; i < SAMPLE_NR; i++) {
+ rtc->val_to_freq[i].value = readl(rtc->regs + rtc_reg);
+ rtc->val_to_freq[i].freq = 0;
+ }
+
+ for (i = 0; i < SAMPLE_NR; i++) {
+ int j = 0;
+ u32 value = rtc->val_to_freq[i].value;
+
+ while (rtc->val_to_freq[j].freq) {
+ if (rtc->val_to_freq[j].value == value) {
+ rtc->val_to_freq[j].freq++;
+ break;
+ }
+ j++;
+ }
+
+ if (!rtc->val_to_freq[j].freq) {
+ rtc->val_to_freq[j].value = value;
+ rtc->val_to_freq[j].freq = 1;
+ }
+
+ if (rtc->val_to_freq[j].freq > max) {
+ index_max = j;
+ max = rtc->val_to_freq[j].freq;
+ }
+
+ /*
+ * If a value already has half of the sample this is the most
+ * frequent one and we can stop the research right now
+ */
+ if (max > SAMPLE_NR / 2)
+ break;
+ }
+
+ return rtc->val_to_freq[index_max].value;
+}
+
+static void armada38x_clear_isr(struct armada38x_rtc *rtc)
+{
+ u32 val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
+
+ writel(val & ~SOC_RTC_ALARM1, rtc->regs_soc + SOC_RTC_INTERRUPT);
+}
+
+static void armada38x_unmask_interrupt(struct armada38x_rtc *rtc)
+{
+ u32 val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
+
+ writel(val | SOC_RTC_ALARM1_MASK, rtc->regs_soc + SOC_RTC_INTERRUPT);
+}
+
+static void armada8k_clear_isr(struct armada38x_rtc *rtc)
+{
+ writel(RTC_8K_ALARM2, rtc->regs_soc + RTC_8K_ISR);
+}
+
+static void armada8k_unmask_interrupt(struct armada38x_rtc *rtc)
+{
+ writel(RTC_8K_ALARM2, rtc->regs_soc + RTC_8K_IMR);
+}
+
static int armada38x_rtc_read_time(struct device *dev, struct rtc_time *tm)
{
struct armada38x_rtc *rtc = dev_get_drvdata(dev);
- unsigned long time, time_check, flags;
+ unsigned long time, flags;
spin_lock_irqsave(&rtc->lock, flags);
- time = readl(rtc->regs + RTC_TIME);
- /*
- * WA for failing time set attempts. As stated in HW ERRATA if
- * more than one second between two time reads is detected
- * then read once again.
- */
- time_check = readl(rtc->regs + RTC_TIME);
- if ((time_check - time) > 1)
- time_check = readl(rtc->regs + RTC_TIME);
-
+ time = rtc->data->read_rtc_reg(rtc, RTC_TIME);
spin_unlock_irqrestore(&rtc->lock, flags);
- rtc_time_to_tm(time_check, tm);
+ rtc_time_to_tm(time, tm);
return 0;
}
@@ -87,16 +234,9 @@ static int armada38x_rtc_set_time(struct device *dev, struct rtc_time *tm)
if (ret)
goto out;
- /*
- * According to errata FE-3124064, Write to RTC TIME register
- * may fail. As a workaround, after writing to RTC TIME
- * register, issue a dummy write of 0x0 twice to RTC Status
- * register.
- */
+
spin_lock_irqsave(&rtc->lock, flags);
rtc_delayed_write(time, rtc, RTC_TIME);
- rtc_delayed_write(0, rtc, RTC_STATUS);
- rtc_delayed_write(0, rtc, RTC_STATUS);
spin_unlock_irqrestore(&rtc->lock, flags);
out:
@@ -107,12 +247,14 @@ static int armada38x_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
{
struct armada38x_rtc *rtc = dev_get_drvdata(dev);
unsigned long time, flags;
+ u32 reg = ALARM_REG(RTC_ALARM1, rtc->data->alarm);
+ u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
u32 val;
spin_lock_irqsave(&rtc->lock, flags);
- time = readl(rtc->regs + RTC_ALARM1);
- val = readl(rtc->regs + RTC_IRQ1_CONF) & RTC_IRQ1_AL_EN;
+ time = rtc->data->read_rtc_reg(rtc, reg);
+ val = rtc->data->read_rtc_reg(rtc, reg_irq) & RTC_IRQ_AL_EN;
spin_unlock_irqrestore(&rtc->lock, flags);
@@ -125,9 +267,10 @@ static int armada38x_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
static int armada38x_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
{
struct armada38x_rtc *rtc = dev_get_drvdata(dev);
+ u32 reg = ALARM_REG(RTC_ALARM1, rtc->data->alarm);
+ u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
unsigned long time, flags;
int ret = 0;
- u32 val;
ret = rtc_tm_to_time(&alrm->time, &time);
@@ -136,13 +279,11 @@ static int armada38x_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
spin_lock_irqsave(&rtc->lock, flags);
- rtc_delayed_write(time, rtc, RTC_ALARM1);
+ rtc_delayed_write(time, rtc, reg);
if (alrm->enabled) {
- rtc_delayed_write(RTC_IRQ1_AL_EN, rtc, RTC_IRQ1_CONF);
- val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
- writel(val | SOC_RTC_ALARM1_MASK,
- rtc->regs_soc + SOC_RTC_INTERRUPT);
+ rtc_delayed_write(RTC_IRQ_AL_EN, rtc, reg_irq);
+ rtc->data->unmask_interrupt(rtc);
}
spin_unlock_irqrestore(&rtc->lock, flags);
@@ -155,14 +296,15 @@ static int armada38x_rtc_alarm_irq_enable(struct device *dev,
unsigned int enabled)
{
struct armada38x_rtc *rtc = dev_get_drvdata(dev);
+ u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
unsigned long flags;
spin_lock_irqsave(&rtc->lock, flags);
if (enabled)
- rtc_delayed_write(RTC_IRQ1_AL_EN, rtc, RTC_IRQ1_CONF);
+ rtc_delayed_write(RTC_IRQ_AL_EN, rtc, reg_irq);
else
- rtc_delayed_write(0, rtc, RTC_IRQ1_CONF);
+ rtc_delayed_write(0, rtc, reg_irq);
spin_unlock_irqrestore(&rtc->lock, flags);
@@ -174,24 +316,23 @@ static irqreturn_t armada38x_rtc_alarm_irq(int irq, void *data)
struct armada38x_rtc *rtc = data;
u32 val;
int event = RTC_IRQF | RTC_AF;
+ u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
dev_dbg(&rtc->rtc_dev->dev, "%s:irq(%d)\n", __func__, irq);
spin_lock(&rtc->lock);
- val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
-
- writel(val & ~SOC_RTC_ALARM1, rtc->regs_soc + SOC_RTC_INTERRUPT);
- val = readl(rtc->regs + RTC_IRQ1_CONF);
- /* disable all the interrupts for alarm 1 */
- rtc_delayed_write(0, rtc, RTC_IRQ1_CONF);
+ rtc->data->clear_isr(rtc);
+ val = rtc->data->read_rtc_reg(rtc, reg_irq);
+ /* disable all the interrupts for alarm*/
+ rtc_delayed_write(0, rtc, reg_irq);
/* Ack the event */
- rtc_delayed_write(RTC_STATUS_ALARM1, rtc, RTC_STATUS);
+ rtc_delayed_write(1 << rtc->data->alarm, rtc, RTC_STATUS);
spin_unlock(&rtc->lock);
- if (val & RTC_IRQ1_FREQ_EN) {
- if (val & RTC_IRQ1_FREQ_1HZ)
+ if (val & RTC_IRQ_FREQ_EN) {
+ if (val & RTC_IRQ_FREQ_1HZ)
event |= RTC_UF;
else
event |= RTC_PF;
@@ -202,7 +343,7 @@ static irqreturn_t armada38x_rtc_alarm_irq(int irq, void *data)
return IRQ_HANDLED;
}
-static struct rtc_class_ops armada38x_rtc_ops = {
+static const struct rtc_class_ops armada38x_rtc_ops = {
.read_time = armada38x_rtc_read_time,
.set_time = armada38x_rtc_set_time,
.read_alarm = armada38x_rtc_read_alarm,
@@ -210,17 +351,65 @@ static struct rtc_class_ops armada38x_rtc_ops = {
.alarm_irq_enable = armada38x_rtc_alarm_irq_enable,
};
+static const struct rtc_class_ops armada38x_rtc_ops_noirq = {
+ .read_time = armada38x_rtc_read_time,
+ .set_time = armada38x_rtc_set_time,
+ .read_alarm = armada38x_rtc_read_alarm,
+};
+
+static const struct armada38x_rtc_data armada38x_data = {
+ .update_mbus_timing = rtc_update_38x_mbus_timing_params,
+ .read_rtc_reg = read_rtc_register_38x_wa,
+ .clear_isr = armada38x_clear_isr,
+ .unmask_interrupt = armada38x_unmask_interrupt,
+ .alarm = ALARM1,
+};
+
+static const struct armada38x_rtc_data armada8k_data = {
+ .update_mbus_timing = rtc_update_8k_mbus_timing_params,
+ .read_rtc_reg = read_rtc_register,
+ .clear_isr = armada8k_clear_isr,
+ .unmask_interrupt = armada8k_unmask_interrupt,
+ .alarm = ALARM2,
+};
+
+#ifdef CONFIG_OF
+static const struct of_device_id armada38x_rtc_of_match_table[] = {
+ {
+ .compatible = "marvell,armada-380-rtc",
+ .data = &armada38x_data,
+ },
+ {
+ .compatible = "marvell,armada-8k-rtc",
+ .data = &armada8k_data,
+ },
+ {}
+};
+MODULE_DEVICE_TABLE(of, armada38x_rtc_of_match_table);
+#endif
+
static __init int armada38x_rtc_probe(struct platform_device *pdev)
{
+ const struct rtc_class_ops *ops;
struct resource *res;
struct armada38x_rtc *rtc;
+ const struct of_device_id *match;
int ret;
+ match = of_match_device(armada38x_rtc_of_match_table, &pdev->dev);
+ if (!match)
+ return -ENODEV;
+
rtc = devm_kzalloc(&pdev->dev, sizeof(struct armada38x_rtc),
GFP_KERNEL);
if (!rtc)
return -ENOMEM;
+ rtc->val_to_freq = devm_kcalloc(&pdev->dev, SAMPLE_NR,
+ sizeof(struct value_to_freq), GFP_KERNEL);
+ if (!rtc->val_to_freq)
+ return -ENOMEM;
+
spin_lock_init(&rtc->lock);
res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rtc");
@@ -242,19 +431,27 @@ static __init int armada38x_rtc_probe(struct platform_device *pdev)
0, pdev->name, rtc) < 0) {
dev_warn(&pdev->dev, "Interrupt not available.\n");
rtc->irq = -1;
+ }
+ platform_set_drvdata(pdev, rtc);
+
+ if (rtc->irq != -1) {
+ device_init_wakeup(&pdev->dev, 1);
+ ops = &armada38x_rtc_ops;
+ } else {
/*
* If there is no interrupt available then we can't
* use the alarm
*/
- armada38x_rtc_ops.set_alarm = NULL;
- armada38x_rtc_ops.alarm_irq_enable = NULL;
+ ops = &armada38x_rtc_ops_noirq;
}
- platform_set_drvdata(pdev, rtc);
- if (rtc->irq != -1)
- device_init_wakeup(&pdev->dev, 1);
+ rtc->data = (struct armada38x_rtc_data *)match->data;
+
+
+ /* Update RTC-MBUS bridge timing parameters */
+ rtc->data->update_mbus_timing(rtc);
rtc->rtc_dev = devm_rtc_device_register(&pdev->dev, pdev->name,
- &armada38x_rtc_ops, THIS_MODULE);
+ ops, THIS_MODULE);
if (IS_ERR(rtc->rtc_dev)) {
ret = PTR_ERR(rtc->rtc_dev);
dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret);
@@ -280,6 +477,9 @@ static int armada38x_rtc_resume(struct device *dev)
if (device_may_wakeup(dev)) {
struct armada38x_rtc *rtc = dev_get_drvdata(dev);
+ /* Update RTC-MBUS bridge timing parameters */
+ rtc->data->update_mbus_timing(rtc);
+
return disable_irq_wake(rtc->irq);
}
@@ -290,14 +490,6 @@ static int armada38x_rtc_resume(struct device *dev)
static SIMPLE_DEV_PM_OPS(armada38x_rtc_pm_ops,
armada38x_rtc_suspend, armada38x_rtc_resume);
-#ifdef CONFIG_OF
-static const struct of_device_id armada38x_rtc_of_match_table[] = {
- { .compatible = "marvell,armada-380-rtc", },
- {}
-};
-MODULE_DEVICE_TABLE(of, armada38x_rtc_of_match_table);
-#endif
-
static struct platform_driver armada38x_rtc_driver = {
.driver = {
.name = "armada38x-rtc",
diff --git a/drivers/rtc/rtc-au1xxx.c b/drivers/rtc/rtc-au1xxx.c
index 84d6e026784d..2ba44ccb9c3a 100644
--- a/drivers/rtc/rtc-au1xxx.c
+++ b/drivers/rtc/rtc-au1xxx.c
@@ -56,7 +56,7 @@ static int au1xtoy_rtc_set_time(struct device *dev, struct rtc_time *tm)
return 0;
}
-static struct rtc_class_ops au1xtoy_rtc_ops = {
+static const struct rtc_class_ops au1xtoy_rtc_ops = {
.read_time = au1xtoy_rtc_read_time,
.set_time = au1xtoy_rtc_set_time,
};
diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c
index 535a5f9338d0..15344b7c07c5 100644
--- a/drivers/rtc/rtc-bfin.c
+++ b/drivers/rtc/rtc-bfin.c
@@ -333,7 +333,7 @@ static int bfin_rtc_proc(struct device *dev, struct seq_file *seq)
#undef yesno
}
-static struct rtc_class_ops bfin_rtc_ops = {
+static const struct rtc_class_ops bfin_rtc_ops = {
.read_time = bfin_rtc_read_time,
.set_time = bfin_rtc_set_time,
.read_alarm = bfin_rtc_read_alarm,
diff --git a/drivers/rtc/rtc-bq32k.c b/drivers/rtc/rtc-bq32k.c
index 397742446007..2b223935001f 100644
--- a/drivers/rtc/rtc-bq32k.c
+++ b/drivers/rtc/rtc-bq32k.c
@@ -34,6 +34,7 @@
#define BQ32K_CALIBRATION 0x07 /* CAL_CFG1, calibration and control */
#define BQ32K_TCH2 0x08 /* Trickle charge enable */
#define BQ32K_CFG2 0x09 /* Trickle charger control */
+#define BQ32K_TCFE BIT(6) /* Trickle charge FET bypass */
struct bq32k_regs {
uint8_t seconds;
@@ -188,6 +189,65 @@ static int trickle_charger_of_init(struct device *dev, struct device_node *node)
return 0;
}
+static ssize_t bq32k_sysfs_show_tricklecharge_bypass(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ int reg, error;
+
+ error = bq32k_read(dev, &reg, BQ32K_CFG2, 1);
+ if (error)
+ return error;
+
+ return sprintf(buf, "%d\n", (reg & BQ32K_TCFE) ? 1 : 0);
+}
+
+static ssize_t bq32k_sysfs_store_tricklecharge_bypass(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int reg, enable, error;
+
+ if (kstrtoint(buf, 0, &enable))
+ return -EINVAL;
+
+ error = bq32k_read(dev, &reg, BQ32K_CFG2, 1);
+ if (error)
+ return error;
+
+ if (enable) {
+ reg |= BQ32K_TCFE;
+ error = bq32k_write(dev, &reg, BQ32K_CFG2, 1);
+ if (error)
+ return error;
+
+ dev_info(dev, "Enabled trickle charge FET bypass.\n");
+ } else {
+ reg &= ~BQ32K_TCFE;
+ error = bq32k_write(dev, &reg, BQ32K_CFG2, 1);
+ if (error)
+ return error;
+
+ dev_info(dev, "Disabled trickle charge FET bypass.\n");
+ }
+
+ return count;
+}
+
+static DEVICE_ATTR(trickle_charge_bypass, 0644,
+ bq32k_sysfs_show_tricklecharge_bypass,
+ bq32k_sysfs_store_tricklecharge_bypass);
+
+static int bq32k_sysfs_register(struct device *dev)
+{
+ return device_create_file(dev, &dev_attr_trickle_charge_bypass);
+}
+
+static void bq32k_sysfs_unregister(struct device *dev)
+{
+ device_remove_file(dev, &dev_attr_trickle_charge_bypass);
+}
+
static int bq32k_probe(struct i2c_client *client,
const struct i2c_device_id *id)
{
@@ -224,11 +284,26 @@ static int bq32k_probe(struct i2c_client *client,
if (IS_ERR(rtc))
return PTR_ERR(rtc);
+ error = bq32k_sysfs_register(&client->dev);
+ if (error) {
+ dev_err(&client->dev,
+ "Unable to create sysfs entries for rtc bq32000\n");
+ return error;
+ }
+
+
i2c_set_clientdata(client, rtc);
return 0;
}
+static int bq32k_remove(struct i2c_client *client)
+{
+ bq32k_sysfs_unregister(&client->dev);
+
+ return 0;
+}
+
static const struct i2c_device_id bq32k_id[] = {
{ "bq32000", 0 },
{ }
@@ -240,6 +315,7 @@ static struct i2c_driver bq32k_driver = {
.name = "bq32k",
},
.probe = bq32k_probe,
+ .remove = bq32k_remove,
.id_table = bq32k_id,
};
diff --git a/drivers/rtc/rtc-dm355evm.c b/drivers/rtc/rtc-dm355evm.c
index 94067f8eeb10..f225cd873ff6 100644
--- a/drivers/rtc/rtc-dm355evm.c
+++ b/drivers/rtc/rtc-dm355evm.c
@@ -116,7 +116,7 @@ static int dm355evm_rtc_set_time(struct device *dev, struct rtc_time *tm)
return 0;
}
-static struct rtc_class_ops dm355evm_rtc_ops = {
+static const struct rtc_class_ops dm355evm_rtc_ops = {
.read_time = dm355evm_rtc_read_time,
.set_time = dm355evm_rtc_set_time,
};
diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c
index b1f20d8c358f..9bb39a06b994 100644
--- a/drivers/rtc/rtc-ds3232.c
+++ b/drivers/rtc/rtc-ds3232.c
@@ -23,28 +23,28 @@
#include <linux/slab.h>
#include <linux/regmap.h>
-#define DS3232_REG_SECONDS 0x00
-#define DS3232_REG_MINUTES 0x01
-#define DS3232_REG_HOURS 0x02
-#define DS3232_REG_AMPM 0x02
-#define DS3232_REG_DAY 0x03
-#define DS3232_REG_DATE 0x04
-#define DS3232_REG_MONTH 0x05
-#define DS3232_REG_CENTURY 0x05
-#define DS3232_REG_YEAR 0x06
-#define DS3232_REG_ALARM1 0x07 /* Alarm 1 BASE */
-#define DS3232_REG_ALARM2 0x0B /* Alarm 2 BASE */
-#define DS3232_REG_CR 0x0E /* Control register */
-# define DS3232_REG_CR_nEOSC 0x80
-# define DS3232_REG_CR_INTCN 0x04
-# define DS3232_REG_CR_A2IE 0x02
-# define DS3232_REG_CR_A1IE 0x01
-
-#define DS3232_REG_SR 0x0F /* control/status register */
-# define DS3232_REG_SR_OSF 0x80
-# define DS3232_REG_SR_BSY 0x04
-# define DS3232_REG_SR_A2F 0x02
-# define DS3232_REG_SR_A1F 0x01
+#define DS3232_REG_SECONDS 0x00
+#define DS3232_REG_MINUTES 0x01
+#define DS3232_REG_HOURS 0x02
+#define DS3232_REG_AMPM 0x02
+#define DS3232_REG_DAY 0x03
+#define DS3232_REG_DATE 0x04
+#define DS3232_REG_MONTH 0x05
+#define DS3232_REG_CENTURY 0x05
+#define DS3232_REG_YEAR 0x06
+#define DS3232_REG_ALARM1 0x07 /* Alarm 1 BASE */
+#define DS3232_REG_ALARM2 0x0B /* Alarm 2 BASE */
+#define DS3232_REG_CR 0x0E /* Control register */
+# define DS3232_REG_CR_nEOSC 0x80
+# define DS3232_REG_CR_INTCN 0x04
+# define DS3232_REG_CR_A2IE 0x02
+# define DS3232_REG_CR_A1IE 0x01
+
+#define DS3232_REG_SR 0x0F /* control/status register */
+# define DS3232_REG_SR_OSF 0x80
+# define DS3232_REG_SR_BSY 0x04
+# define DS3232_REG_SR_A2F 0x02
+# define DS3232_REG_SR_A1F 0x01
struct ds3232 {
struct device *dev;
@@ -363,6 +363,9 @@ static int ds3232_probe(struct device *dev, struct regmap *regmap, int irq,
if (ret)
return ret;
+ if (ds3232->irq > 0)
+ device_init_wakeup(dev, 1);
+
ds3232->rtc = devm_rtc_device_register(dev, name, &ds3232_rtc_ops,
THIS_MODULE);
if (IS_ERR(ds3232->rtc))
@@ -374,10 +377,10 @@ static int ds3232_probe(struct device *dev, struct regmap *regmap, int irq,
IRQF_SHARED | IRQF_ONESHOT,
name, dev);
if (ret) {
+ device_set_wakeup_capable(dev, 0);
ds3232->irq = 0;
dev_err(dev, "unable to request IRQ\n");
- } else
- device_init_wakeup(dev, 1);
+ }
}
return 0;
@@ -420,6 +423,7 @@ static int ds3232_i2c_probe(struct i2c_client *client,
static const struct regmap_config config = {
.reg_bits = 8,
.val_bits = 8,
+ .max_register = 0x13,
};
regmap = devm_regmap_init_i2c(client, &config);
@@ -479,6 +483,7 @@ static int ds3234_probe(struct spi_device *spi)
static const struct regmap_config config = {
.reg_bits = 8,
.val_bits = 8,
+ .max_register = 0x13,
.write_flag_mask = 0x80,
};
struct regmap *regmap;
diff --git a/drivers/rtc/rtc-gemini.c b/drivers/rtc/rtc-gemini.c
index 688debc14348..ccf0dbadb62d 100644
--- a/drivers/rtc/rtc-gemini.c
+++ b/drivers/rtc/rtc-gemini.c
@@ -159,9 +159,16 @@ static int gemini_rtc_remove(struct platform_device *pdev)
return 0;
}
+static const struct of_device_id gemini_rtc_dt_match[] = {
+ { .compatible = "cortina,gemini-rtc" },
+ { }
+};
+MODULE_DEVICE_TABLE(of, gemini_rtc_dt_match);
+
static struct platform_driver gemini_rtc_driver = {
.driver = {
.name = DRV_NAME,
+ .of_match_table = gemini_rtc_dt_match,
},
.probe = gemini_rtc_probe,
.remove = gemini_rtc_remove,
diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c
index 67b56b80dc70..6b54f6c24c5f 100644
--- a/drivers/rtc/rtc-imxdi.c
+++ b/drivers/rtc/rtc-imxdi.c
@@ -108,7 +108,6 @@
* @pdev: pionter to platform dev
* @rtc: pointer to rtc struct
* @ioaddr: IO registers pointer
- * @irq: dryice normal interrupt
* @clk: input reference clock
* @dsr: copy of the DSR register
* @irq_lock: interrupt enable register (DIER) lock
@@ -120,7 +119,6 @@ struct imxdi_dev {
struct platform_device *pdev;
struct rtc_device *rtc;
void __iomem *ioaddr;
- int irq;
struct clk *clk;
u32 dsr;
spinlock_t irq_lock;
@@ -668,7 +666,7 @@ static int dryice_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
return 0;
}
-static struct rtc_class_ops dryice_rtc_ops = {
+static const struct rtc_class_ops dryice_rtc_ops = {
.read_time = dryice_rtc_read_time,
.set_mmss = dryice_rtc_set_mmss,
.alarm_irq_enable = dryice_rtc_alarm_irq_enable,
@@ -677,9 +675,9 @@ static struct rtc_class_ops dryice_rtc_ops = {
};
/*
- * dryice "normal" interrupt handler
+ * interrupt handler for dryice "normal" and security violation interrupt
*/
-static irqreturn_t dryice_norm_irq(int irq, void *dev_id)
+static irqreturn_t dryice_irq(int irq, void *dev_id)
{
struct imxdi_dev *imxdi = dev_id;
u32 dsr, dier;
@@ -765,6 +763,7 @@ static int __init dryice_rtc_probe(struct platform_device *pdev)
{
struct resource *res;
struct imxdi_dev *imxdi;
+ int norm_irq, sec_irq;
int rc;
imxdi = devm_kzalloc(&pdev->dev, sizeof(*imxdi), GFP_KERNEL);
@@ -780,9 +779,16 @@ static int __init dryice_rtc_probe(struct platform_device *pdev)
spin_lock_init(&imxdi->irq_lock);
- imxdi->irq = platform_get_irq(pdev, 0);
- if (imxdi->irq < 0)
- return imxdi->irq;
+ norm_irq = platform_get_irq(pdev, 0);
+ if (norm_irq < 0)
+ return norm_irq;
+
+ /* the 2nd irq is the security violation irq
+ * make this optional, don't break the device tree ABI
+ */
+ sec_irq = platform_get_irq(pdev, 1);
+ if (sec_irq <= 0)
+ sec_irq = IRQ_NOTCONNECTED;
init_waitqueue_head(&imxdi->write_wait);
@@ -808,13 +814,20 @@ static int __init dryice_rtc_probe(struct platform_device *pdev)
if (rc != 0)
goto err;
- rc = devm_request_irq(&pdev->dev, imxdi->irq, dryice_norm_irq,
- IRQF_SHARED, pdev->name, imxdi);
+ rc = devm_request_irq(&pdev->dev, norm_irq, dryice_irq,
+ IRQF_SHARED, pdev->name, imxdi);
if (rc) {
dev_warn(&pdev->dev, "interrupt not available.\n");
goto err;
}
+ rc = devm_request_irq(&pdev->dev, sec_irq, dryice_irq,
+ IRQF_SHARED, pdev->name, imxdi);
+ if (rc) {
+ dev_warn(&pdev->dev, "security violation interrupt not available.\n");
+ /* this is not an error, see above */
+ }
+
platform_set_drvdata(pdev, imxdi);
imxdi->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
&dryice_rtc_ops, THIS_MODULE);
diff --git a/drivers/rtc/rtc-ls1x.c b/drivers/rtc/rtc-ls1x.c
index 22a9ec4f2b83..e04ca54f21e2 100644
--- a/drivers/rtc/rtc-ls1x.c
+++ b/drivers/rtc/rtc-ls1x.c
@@ -138,7 +138,7 @@ err:
return ret;
}
-static struct rtc_class_ops ls1x_rtc_ops = {
+static const struct rtc_class_ops ls1x_rtc_ops = {
.read_time = ls1x_rtc_read_time,
.set_time = ls1x_rtc_set_time,
};
diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c
index 0eeb5714c00f..02af045305dd 100644
--- a/drivers/rtc/rtc-m48t86.c
+++ b/drivers/rtc/rtc-m48t86.c
@@ -16,62 +16,88 @@
#include <linux/module.h>
#include <linux/rtc.h>
#include <linux/platform_device.h>
-#include <linux/platform_data/rtc-m48t86.h>
#include <linux/bcd.h>
+#include <linux/io.h>
-#define M48T86_REG_SEC 0x00
-#define M48T86_REG_SECALRM 0x01
-#define M48T86_REG_MIN 0x02
-#define M48T86_REG_MINALRM 0x03
-#define M48T86_REG_HOUR 0x04
-#define M48T86_REG_HOURALRM 0x05
-#define M48T86_REG_DOW 0x06 /* 1 = sunday */
-#define M48T86_REG_DOM 0x07
-#define M48T86_REG_MONTH 0x08 /* 1 - 12 */
-#define M48T86_REG_YEAR 0x09 /* 0 - 99 */
-#define M48T86_REG_A 0x0A
-#define M48T86_REG_B 0x0B
-#define M48T86_REG_C 0x0C
-#define M48T86_REG_D 0x0D
-
-#define M48T86_REG_B_H24 (1 << 1)
-#define M48T86_REG_B_DM (1 << 2)
-#define M48T86_REG_B_SET (1 << 7)
-#define M48T86_REG_D_VRT (1 << 7)
+#define M48T86_SEC 0x00
+#define M48T86_SECALRM 0x01
+#define M48T86_MIN 0x02
+#define M48T86_MINALRM 0x03
+#define M48T86_HOUR 0x04
+#define M48T86_HOURALRM 0x05
+#define M48T86_DOW 0x06 /* 1 = sunday */
+#define M48T86_DOM 0x07
+#define M48T86_MONTH 0x08 /* 1 - 12 */
+#define M48T86_YEAR 0x09 /* 0 - 99 */
+#define M48T86_A 0x0a
+#define M48T86_B 0x0b
+#define M48T86_B_SET BIT(7)
+#define M48T86_B_DM BIT(2)
+#define M48T86_B_H24 BIT(1)
+#define M48T86_C 0x0c
+#define M48T86_D 0x0d
+#define M48T86_D_VRT BIT(7)
+#define M48T86_NVRAM(x) (0x0e + (x))
+#define M48T86_NVRAM_LEN 114
+
+struct m48t86_rtc_info {
+ void __iomem *index_reg;
+ void __iomem *data_reg;
+ struct rtc_device *rtc;
+};
+
+static unsigned char m48t86_readb(struct device *dev, unsigned long addr)
+{
+ struct m48t86_rtc_info *info = dev_get_drvdata(dev);
+ unsigned char value;
+
+ writeb(addr, info->index_reg);
+ value = readb(info->data_reg);
+
+ return value;
+}
+
+static void m48t86_writeb(struct device *dev,
+ unsigned char value, unsigned long addr)
+{
+ struct m48t86_rtc_info *info = dev_get_drvdata(dev);
+
+ writeb(addr, info->index_reg);
+ writeb(value, info->data_reg);
+}
static int m48t86_rtc_read_time(struct device *dev, struct rtc_time *tm)
{
unsigned char reg;
- struct platform_device *pdev = to_platform_device(dev);
- struct m48t86_ops *ops = dev_get_platdata(&pdev->dev);
- reg = ops->readbyte(M48T86_REG_B);
+ reg = m48t86_readb(dev, M48T86_B);
- if (reg & M48T86_REG_B_DM) {
+ if (reg & M48T86_B_DM) {
/* data (binary) mode */
- tm->tm_sec = ops->readbyte(M48T86_REG_SEC);
- tm->tm_min = ops->readbyte(M48T86_REG_MIN);
- tm->tm_hour = ops->readbyte(M48T86_REG_HOUR) & 0x3F;
- tm->tm_mday = ops->readbyte(M48T86_REG_DOM);
+ tm->tm_sec = m48t86_readb(dev, M48T86_SEC);
+ tm->tm_min = m48t86_readb(dev, M48T86_MIN);
+ tm->tm_hour = m48t86_readb(dev, M48T86_HOUR) & 0x3f;
+ tm->tm_mday = m48t86_readb(dev, M48T86_DOM);
/* tm_mon is 0-11 */
- tm->tm_mon = ops->readbyte(M48T86_REG_MONTH) - 1;
- tm->tm_year = ops->readbyte(M48T86_REG_YEAR) + 100;
- tm->tm_wday = ops->readbyte(M48T86_REG_DOW);
+ tm->tm_mon = m48t86_readb(dev, M48T86_MONTH) - 1;
+ tm->tm_year = m48t86_readb(dev, M48T86_YEAR) + 100;
+ tm->tm_wday = m48t86_readb(dev, M48T86_DOW);
} else {
/* bcd mode */
- tm->tm_sec = bcd2bin(ops->readbyte(M48T86_REG_SEC));
- tm->tm_min = bcd2bin(ops->readbyte(M48T86_REG_MIN));
- tm->tm_hour = bcd2bin(ops->readbyte(M48T86_REG_HOUR) & 0x3F);
- tm->tm_mday = bcd2bin(ops->readbyte(M48T86_REG_DOM));
+ tm->tm_sec = bcd2bin(m48t86_readb(dev, M48T86_SEC));
+ tm->tm_min = bcd2bin(m48t86_readb(dev, M48T86_MIN));
+ tm->tm_hour = bcd2bin(m48t86_readb(dev, M48T86_HOUR) &
+ 0x3f);
+ tm->tm_mday = bcd2bin(m48t86_readb(dev, M48T86_DOM));
/* tm_mon is 0-11 */
- tm->tm_mon = bcd2bin(ops->readbyte(M48T86_REG_MONTH)) - 1;
- tm->tm_year = bcd2bin(ops->readbyte(M48T86_REG_YEAR)) + 100;
- tm->tm_wday = bcd2bin(ops->readbyte(M48T86_REG_DOW));
+ tm->tm_mon = bcd2bin(m48t86_readb(dev, M48T86_MONTH)) - 1;
+ tm->tm_year = bcd2bin(m48t86_readb(dev, M48T86_YEAR)) + 100;
+ tm->tm_wday = bcd2bin(m48t86_readb(dev, M48T86_DOW));
}
/* correct the hour if the clock is in 12h mode */
- if (!(reg & M48T86_REG_B_H24))
- if (ops->readbyte(M48T86_REG_HOUR) & 0x80)
+ if (!(reg & M48T86_B_H24))
+ if (m48t86_readb(dev, M48T86_HOUR) & 0x80)
tm->tm_hour += 12;
return rtc_valid_tm(tm);
@@ -80,38 +106,36 @@ static int m48t86_rtc_read_time(struct device *dev, struct rtc_time *tm)
static int m48t86_rtc_set_time(struct device *dev, struct rtc_time *tm)
{
unsigned char reg;
- struct platform_device *pdev = to_platform_device(dev);
- struct m48t86_ops *ops = dev_get_platdata(&pdev->dev);
- reg = ops->readbyte(M48T86_REG_B);
+ reg = m48t86_readb(dev, M48T86_B);
/* update flag and 24h mode */
- reg |= M48T86_REG_B_SET | M48T86_REG_B_H24;
- ops->writebyte(reg, M48T86_REG_B);
+ reg |= M48T86_B_SET | M48T86_B_H24;
+ m48t86_writeb(dev, reg, M48T86_B);
- if (reg & M48T86_REG_B_DM) {
+ if (reg & M48T86_B_DM) {
/* data (binary) mode */
- ops->writebyte(tm->tm_sec, M48T86_REG_SEC);
- ops->writebyte(tm->tm_min, M48T86_REG_MIN);
- ops->writebyte(tm->tm_hour, M48T86_REG_HOUR);
- ops->writebyte(tm->tm_mday, M48T86_REG_DOM);
- ops->writebyte(tm->tm_mon + 1, M48T86_REG_MONTH);
- ops->writebyte(tm->tm_year % 100, M48T86_REG_YEAR);
- ops->writebyte(tm->tm_wday, M48T86_REG_DOW);
+ m48t86_writeb(dev, tm->tm_sec, M48T86_SEC);
+ m48t86_writeb(dev, tm->tm_min, M48T86_MIN);
+ m48t86_writeb(dev, tm->tm_hour, M48T86_HOUR);
+ m48t86_writeb(dev, tm->tm_mday, M48T86_DOM);
+ m48t86_writeb(dev, tm->tm_mon + 1, M48T86_MONTH);
+ m48t86_writeb(dev, tm->tm_year % 100, M48T86_YEAR);
+ m48t86_writeb(dev, tm->tm_wday, M48T86_DOW);
} else {
/* bcd mode */
- ops->writebyte(bin2bcd(tm->tm_sec), M48T86_REG_SEC);
- ops->writebyte(bin2bcd(tm->tm_min), M48T86_REG_MIN);
- ops->writebyte(bin2bcd(tm->tm_hour), M48T86_REG_HOUR);
- ops->writebyte(bin2bcd(tm->tm_mday), M48T86_REG_DOM);
- ops->writebyte(bin2bcd(tm->tm_mon + 1), M48T86_REG_MONTH);
- ops->writebyte(bin2bcd(tm->tm_year % 100), M48T86_REG_YEAR);
- ops->writebyte(bin2bcd(tm->tm_wday), M48T86_REG_DOW);
+ m48t86_writeb(dev, bin2bcd(tm->tm_sec), M48T86_SEC);
+ m48t86_writeb(dev, bin2bcd(tm->tm_min), M48T86_MIN);
+ m48t86_writeb(dev, bin2bcd(tm->tm_hour), M48T86_HOUR);
+ m48t86_writeb(dev, bin2bcd(tm->tm_mday), M48T86_DOM);
+ m48t86_writeb(dev, bin2bcd(tm->tm_mon + 1), M48T86_MONTH);
+ m48t86_writeb(dev, bin2bcd(tm->tm_year % 100), M48T86_YEAR);
+ m48t86_writeb(dev, bin2bcd(tm->tm_wday), M48T86_DOW);
}
/* update ended */
- reg &= ~M48T86_REG_B_SET;
- ops->writebyte(reg, M48T86_REG_B);
+ reg &= ~M48T86_B_SET;
+ m48t86_writeb(dev, reg, M48T86_B);
return 0;
}
@@ -119,18 +143,16 @@ static int m48t86_rtc_set_time(struct device *dev, struct rtc_time *tm)
static int m48t86_rtc_proc(struct device *dev, struct seq_file *seq)
{
unsigned char reg;
- struct platform_device *pdev = to_platform_device(dev);
- struct m48t86_ops *ops = dev_get_platdata(&pdev->dev);
- reg = ops->readbyte(M48T86_REG_B);
+ reg = m48t86_readb(dev, M48T86_B);
seq_printf(seq, "mode\t\t: %s\n",
- (reg & M48T86_REG_B_DM) ? "binary" : "bcd");
+ (reg & M48T86_B_DM) ? "binary" : "bcd");
- reg = ops->readbyte(M48T86_REG_D);
+ reg = m48t86_readb(dev, M48T86_D);
seq_printf(seq, "battery\t\t: %s\n",
- (reg & M48T86_REG_D_VRT) ? "ok" : "exhausted");
+ (reg & M48T86_D_VRT) ? "ok" : "exhausted");
return 0;
}
@@ -141,25 +163,116 @@ static const struct rtc_class_ops m48t86_rtc_ops = {
.proc = m48t86_rtc_proc,
};
-static int m48t86_rtc_probe(struct platform_device *dev)
+static ssize_t m48t86_nvram_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ unsigned int i;
+
+ for (i = 0; i < count; i++)
+ buf[i] = m48t86_readb(dev, M48T86_NVRAM(off + i));
+
+ return count;
+}
+
+static ssize_t m48t86_nvram_write(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
{
+ struct device *dev = kobj_to_dev(kobj);
+ unsigned int i;
+
+ for (i = 0; i < count; i++)
+ m48t86_writeb(dev, buf[i], M48T86_NVRAM(off + i));
+
+ return count;
+}
+
+static BIN_ATTR(nvram, 0644, m48t86_nvram_read, m48t86_nvram_write,
+ M48T86_NVRAM_LEN);
+
+/*
+ * The RTC is an optional feature at purchase time on some Technologic Systems
+ * boards. Verify that it actually exists by checking if the last two bytes
+ * of the NVRAM can be changed.
+ *
+ * This is based on the method used in their rtc7800.c example.
+ */
+static bool m48t86_verify_chip(struct platform_device *pdev)
+{
+ unsigned int offset0 = M48T86_NVRAM(M48T86_NVRAM_LEN - 2);
+ unsigned int offset1 = M48T86_NVRAM(M48T86_NVRAM_LEN - 1);
+ unsigned char tmp0, tmp1;
+
+ tmp0 = m48t86_readb(&pdev->dev, offset0);
+ tmp1 = m48t86_readb(&pdev->dev, offset1);
+
+ m48t86_writeb(&pdev->dev, 0x00, offset0);
+ m48t86_writeb(&pdev->dev, 0x55, offset1);
+ if (m48t86_readb(&pdev->dev, offset1) == 0x55) {
+ m48t86_writeb(&pdev->dev, 0xaa, offset1);
+ if (m48t86_readb(&pdev->dev, offset1) == 0xaa &&
+ m48t86_readb(&pdev->dev, offset0) == 0x00) {
+ m48t86_writeb(&pdev->dev, tmp0, offset0);
+ m48t86_writeb(&pdev->dev, tmp1, offset1);
+
+ return true;
+ }
+ }
+ return false;
+}
+
+static int m48t86_rtc_probe(struct platform_device *pdev)
+{
+ struct m48t86_rtc_info *info;
+ struct resource *res;
unsigned char reg;
- struct m48t86_ops *ops = dev_get_platdata(&dev->dev);
- struct rtc_device *rtc;
- rtc = devm_rtc_device_register(&dev->dev, "m48t86",
- &m48t86_rtc_ops, THIS_MODULE);
+ info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res)
+ return -ENODEV;
+ info->index_reg = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(info->index_reg))
+ return PTR_ERR(info->index_reg);
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+ if (!res)
+ return -ENODEV;
+ info->data_reg = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(info->data_reg))
+ return PTR_ERR(info->data_reg);
- if (IS_ERR(rtc))
- return PTR_ERR(rtc);
+ dev_set_drvdata(&pdev->dev, info);
+
+ if (!m48t86_verify_chip(pdev)) {
+ dev_info(&pdev->dev, "RTC not present\n");
+ return -ENODEV;
+ }
- platform_set_drvdata(dev, rtc);
+ info->rtc = devm_rtc_device_register(&pdev->dev, "m48t86",
+ &m48t86_rtc_ops, THIS_MODULE);
+ if (IS_ERR(info->rtc))
+ return PTR_ERR(info->rtc);
/* read battery status */
- reg = ops->readbyte(M48T86_REG_D);
- dev_info(&dev->dev, "battery %s\n",
- (reg & M48T86_REG_D_VRT) ? "ok" : "exhausted");
+ reg = m48t86_readb(&pdev->dev, M48T86_D);
+ dev_info(&pdev->dev, "battery %s\n",
+ (reg & M48T86_D_VRT) ? "ok" : "exhausted");
+ if (device_create_bin_file(&pdev->dev, &bin_attr_nvram))
+ dev_err(&pdev->dev, "failed to create nvram sysfs entry\n");
+
+ return 0;
+}
+
+static int m48t86_rtc_remove(struct platform_device *pdev)
+{
+ device_remove_bin_file(&pdev->dev, &bin_attr_nvram);
return 0;
}
@@ -168,6 +281,7 @@ static struct platform_driver m48t86_rtc_platform_driver = {
.name = "rtc-m48t86",
},
.probe = m48t86_rtc_probe,
+ .remove = m48t86_rtc_remove,
};
module_platform_driver(m48t86_rtc_platform_driver);
diff --git a/drivers/rtc/rtc-mcp795.c b/drivers/rtc/rtc-mcp795.c
index ce75e421ba00..77f21331ae21 100644
--- a/drivers/rtc/rtc-mcp795.c
+++ b/drivers/rtc/rtc-mcp795.c
@@ -44,12 +44,22 @@
#define MCP795_REG_DAY 0x04
#define MCP795_REG_MONTH 0x06
#define MCP795_REG_CONTROL 0x08
+#define MCP795_REG_ALM0_SECONDS 0x0C
+#define MCP795_REG_ALM0_DAY 0x0F
#define MCP795_ST_BIT BIT(7)
#define MCP795_24_BIT BIT(6)
#define MCP795_LP_BIT BIT(5)
#define MCP795_EXTOSC_BIT BIT(3)
#define MCP795_OSCON_BIT BIT(5)
+#define MCP795_ALM0_BIT BIT(4)
+#define MCP795_ALM1_BIT BIT(5)
+#define MCP795_ALM0IF_BIT BIT(3)
+#define MCP795_ALM0C0_BIT BIT(4)
+#define MCP795_ALM0C1_BIT BIT(5)
+#define MCP795_ALM0C2_BIT BIT(6)
+
+#define SEC_PER_DAY (24 * 60 * 60)
static int mcp795_rtcc_read(struct device *dev, u8 addr, u8 *buf, u8 count)
{
@@ -150,6 +160,30 @@ static int mcp795_start_oscillator(struct device *dev, bool *extosc)
dev, MCP795_REG_SECONDS, MCP795_ST_BIT, MCP795_ST_BIT);
}
+/* Enable or disable Alarm 0 in RTC */
+static int mcp795_update_alarm(struct device *dev, bool enable)
+{
+ int ret;
+
+ dev_dbg(dev, "%s alarm\n", enable ? "Enable" : "Disable");
+
+ if (enable) {
+ /* clear ALM0IF (Alarm 0 Interrupt Flag) bit */
+ ret = mcp795_rtcc_set_bits(dev, MCP795_REG_ALM0_DAY,
+ MCP795_ALM0IF_BIT, 0);
+ if (ret)
+ return ret;
+ /* enable alarm 0 */
+ ret = mcp795_rtcc_set_bits(dev, MCP795_REG_CONTROL,
+ MCP795_ALM0_BIT, MCP795_ALM0_BIT);
+ } else {
+ /* disable alarm 0 and alarm 1 */
+ ret = mcp795_rtcc_set_bits(dev, MCP795_REG_CONTROL,
+ MCP795_ALM0_BIT | MCP795_ALM1_BIT, 0);
+ }
+ return ret;
+}
+
static int mcp795_set_time(struct device *dev, struct rtc_time *tim)
{
int ret;
@@ -170,6 +204,7 @@ static int mcp795_set_time(struct device *dev, struct rtc_time *tim)
data[0] = (data[0] & 0x80) | bin2bcd(tim->tm_sec);
data[1] = (data[1] & 0x80) | bin2bcd(tim->tm_min);
data[2] = bin2bcd(tim->tm_hour);
+ data[3] = (data[3] & 0xF8) | bin2bcd(tim->tm_wday + 1);
data[4] = bin2bcd(tim->tm_mday);
data[5] = (data[5] & MCP795_LP_BIT) | bin2bcd(tim->tm_mon + 1);
@@ -198,9 +233,9 @@ static int mcp795_set_time(struct device *dev, struct rtc_time *tim)
if (ret)
return ret;
- dev_dbg(dev, "Set mcp795: %04d-%02d-%02d %02d:%02d:%02d\n",
+ dev_dbg(dev, "Set mcp795: %04d-%02d-%02d(%d) %02d:%02d:%02d\n",
tim->tm_year + 1900, tim->tm_mon, tim->tm_mday,
- tim->tm_hour, tim->tm_min, tim->tm_sec);
+ tim->tm_wday, tim->tm_hour, tim->tm_min, tim->tm_sec);
return 0;
}
@@ -218,20 +253,139 @@ static int mcp795_read_time(struct device *dev, struct rtc_time *tim)
tim->tm_sec = bcd2bin(data[0] & 0x7F);
tim->tm_min = bcd2bin(data[1] & 0x7F);
tim->tm_hour = bcd2bin(data[2] & 0x3F);
+ tim->tm_wday = bcd2bin(data[3] & 0x07) - 1;
tim->tm_mday = bcd2bin(data[4] & 0x3F);
tim->tm_mon = bcd2bin(data[5] & 0x1F) - 1;
tim->tm_year = bcd2bin(data[6]) + 100; /* Assume we are in 20xx */
- dev_dbg(dev, "Read from mcp795: %04d-%02d-%02d %02d:%02d:%02d\n",
- tim->tm_year + 1900, tim->tm_mon, tim->tm_mday,
- tim->tm_hour, tim->tm_min, tim->tm_sec);
+ dev_dbg(dev, "Read from mcp795: %04d-%02d-%02d(%d) %02d:%02d:%02d\n",
+ tim->tm_year + 1900, tim->tm_mon, tim->tm_mday,
+ tim->tm_wday, tim->tm_hour, tim->tm_min, tim->tm_sec);
return rtc_valid_tm(tim);
}
+static int mcp795_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+ struct rtc_time now_tm;
+ time64_t now;
+ time64_t later;
+ u8 tmp[6];
+ int ret;
+
+ /* Read current time from RTC hardware */
+ ret = mcp795_read_time(dev, &now_tm);
+ if (ret)
+ return ret;
+ /* Get the number of seconds since 1970 */
+ now = rtc_tm_to_time64(&now_tm);
+ later = rtc_tm_to_time64(&alm->time);
+ if (later <= now)
+ return -EINVAL;
+ /* make sure alarm fires within the next one year */
+ if ((later - now) >=
+ (SEC_PER_DAY * (365 + is_leap_year(alm->time.tm_year))))
+ return -EDOM;
+ /* disable alarm */
+ ret = mcp795_update_alarm(dev, false);
+ if (ret)
+ return ret;
+ /* Read registers, so we can leave configuration bits untouched */
+ ret = mcp795_rtcc_read(dev, MCP795_REG_ALM0_SECONDS, tmp, sizeof(tmp));
+ if (ret)
+ return ret;
+
+ alm->time.tm_year = -1;
+ alm->time.tm_isdst = -1;
+ alm->time.tm_yday = -1;
+
+ tmp[0] = (tmp[0] & 0x80) | bin2bcd(alm->time.tm_sec);
+ tmp[1] = (tmp[1] & 0x80) | bin2bcd(alm->time.tm_min);
+ tmp[2] = (tmp[2] & 0xE0) | bin2bcd(alm->time.tm_hour);
+ tmp[3] = (tmp[3] & 0x80) | bin2bcd(alm->time.tm_wday + 1);
+ /* set alarm match: seconds, minutes, hour, day, date and month */
+ tmp[3] |= (MCP795_ALM0C2_BIT | MCP795_ALM0C1_BIT | MCP795_ALM0C0_BIT);
+ tmp[4] = (tmp[4] & 0xC0) | bin2bcd(alm->time.tm_mday);
+ tmp[5] = (tmp[5] & 0xE0) | bin2bcd(alm->time.tm_mon + 1);
+
+ ret = mcp795_rtcc_write(dev, MCP795_REG_ALM0_SECONDS, tmp, sizeof(tmp));
+ if (ret)
+ return ret;
+
+ /* enable alarm if requested */
+ if (alm->enabled) {
+ ret = mcp795_update_alarm(dev, true);
+ if (ret)
+ return ret;
+ dev_dbg(dev, "Alarm IRQ armed\n");
+ }
+ dev_dbg(dev, "Set alarm: %02d-%02d(%d) %02d:%02d:%02d\n",
+ alm->time.tm_mon, alm->time.tm_mday, alm->time.tm_wday,
+ alm->time.tm_hour, alm->time.tm_min, alm->time.tm_sec);
+ return 0;
+}
+
+static int mcp795_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+ u8 data[6];
+ int ret;
+
+ ret = mcp795_rtcc_read(
+ dev, MCP795_REG_ALM0_SECONDS, data, sizeof(data));
+ if (ret)
+ return ret;
+
+ alm->time.tm_sec = bcd2bin(data[0] & 0x7F);
+ alm->time.tm_min = bcd2bin(data[1] & 0x7F);
+ alm->time.tm_hour = bcd2bin(data[2] & 0x1F);
+ alm->time.tm_wday = bcd2bin(data[3] & 0x07) - 1;
+ alm->time.tm_mday = bcd2bin(data[4] & 0x3F);
+ alm->time.tm_mon = bcd2bin(data[5] & 0x1F) - 1;
+ alm->time.tm_year = -1;
+ alm->time.tm_isdst = -1;
+ alm->time.tm_yday = -1;
+
+ dev_dbg(dev, "Read alarm: %02d-%02d(%d) %02d:%02d:%02d\n",
+ alm->time.tm_mon, alm->time.tm_mday, alm->time.tm_wday,
+ alm->time.tm_hour, alm->time.tm_min, alm->time.tm_sec);
+ return 0;
+}
+
+static int mcp795_alarm_irq_enable(struct device *dev, unsigned int enabled)
+{
+ return mcp795_update_alarm(dev, !!enabled);
+}
+
+static irqreturn_t mcp795_irq(int irq, void *data)
+{
+ struct spi_device *spi = data;
+ struct rtc_device *rtc = spi_get_drvdata(spi);
+ struct mutex *lock = &rtc->ops_lock;
+ int ret;
+
+ mutex_lock(lock);
+
+ /* Disable alarm.
+ * There is no need to clear ALM0IF (Alarm 0 Interrupt Flag) bit,
+ * because it is done every time when alarm is enabled.
+ */
+ ret = mcp795_update_alarm(&spi->dev, false);
+ if (ret)
+ dev_err(&spi->dev,
+ "Failed to disable alarm in IRQ (ret=%d)\n", ret);
+ rtc_update_irq(rtc, 1, RTC_AF | RTC_IRQF);
+
+ mutex_unlock(lock);
+
+ return IRQ_HANDLED;
+}
+
static const struct rtc_class_ops mcp795_rtc_ops = {
.read_time = mcp795_read_time,
- .set_time = mcp795_set_time
+ .set_time = mcp795_set_time,
+ .read_alarm = mcp795_read_alarm,
+ .set_alarm = mcp795_set_alarm,
+ .alarm_irq_enable = mcp795_alarm_irq_enable
};
static int mcp795_probe(struct spi_device *spi)
@@ -259,6 +413,23 @@ static int mcp795_probe(struct spi_device *spi)
spi_set_drvdata(spi, rtc);
+ if (spi->irq > 0) {
+ dev_dbg(&spi->dev, "Alarm support enabled\n");
+
+ /* Clear any pending alarm (ALM0IF bit) before requesting
+ * the interrupt.
+ */
+ mcp795_rtcc_set_bits(&spi->dev, MCP795_REG_ALM0_DAY,
+ MCP795_ALM0IF_BIT, 0);
+ ret = devm_request_threaded_irq(&spi->dev, spi->irq, NULL,
+ mcp795_irq, IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+ dev_name(&rtc->dev), spi);
+ if (ret)
+ dev_err(&spi->dev, "Failed to request IRQ: %d: %d\n",
+ spi->irq, ret);
+ else
+ device_init_wakeup(&spi->dev, true);
+ }
return 0;
}
diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c
index 359876a88ac8..77319122642a 100644
--- a/drivers/rtc/rtc-mxc.c
+++ b/drivers/rtc/rtc-mxc.c
@@ -353,7 +353,7 @@ static int mxc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
}
/* RTC layer */
-static struct rtc_class_ops mxc_rtc_ops = {
+static const struct rtc_class_ops mxc_rtc_ops = {
.release = mxc_rtc_release,
.read_time = mxc_rtc_read_time,
.set_mmss64 = mxc_rtc_set_mmss,
diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c
index 2bfdf638b673..f33447c5db85 100644
--- a/drivers/rtc/rtc-pcf2127.c
+++ b/drivers/rtc/rtc-pcf2127.c
@@ -52,9 +52,20 @@ static int pcf2127_rtc_read_time(struct device *dev, struct rtc_time *tm)
struct pcf2127 *pcf2127 = dev_get_drvdata(dev);
unsigned char buf[10];
int ret;
+ int i;
- ret = regmap_bulk_read(pcf2127->regmap, PCF2127_REG_CTRL1, buf,
- sizeof(buf));
+ for (i = 0; i <= PCF2127_REG_CTRL3; i++) {
+ ret = regmap_read(pcf2127->regmap, PCF2127_REG_CTRL1 + i,
+ (unsigned int *)(buf + i));
+ if (ret) {
+ dev_err(dev, "%s: read error\n", __func__);
+ return ret;
+ }
+ }
+
+ ret = regmap_bulk_read(pcf2127->regmap, PCF2127_REG_SC,
+ (buf + PCF2127_REG_SC),
+ ARRAY_SIZE(buf) - PCF2127_REG_SC);
if (ret) {
dev_err(dev, "%s: read error\n", __func__);
return ret;
diff --git a/drivers/rtc/rtc-rx8010.c b/drivers/rtc/rtc-rx8010.c
index 7163b91bb773..d08da371912c 100644
--- a/drivers/rtc/rtc-rx8010.c
+++ b/drivers/rtc/rtc-rx8010.c
@@ -63,7 +63,6 @@ struct rx8010_data {
struct i2c_client *client;
struct rtc_device *rtc;
u8 ctrlreg;
- spinlock_t flags_lock;
};
static irqreturn_t rx8010_irq_1_handler(int irq, void *dev_id)
@@ -72,12 +71,12 @@ static irqreturn_t rx8010_irq_1_handler(int irq, void *dev_id)
struct rx8010_data *rx8010 = i2c_get_clientdata(client);
int flagreg;
- spin_lock(&rx8010->flags_lock);
+ mutex_lock(&rx8010->rtc->ops_lock);
flagreg = i2c_smbus_read_byte_data(client, RX8010_FLAG);
if (flagreg <= 0) {
- spin_unlock(&rx8010->flags_lock);
+ mutex_unlock(&rx8010->rtc->ops_lock);
return IRQ_NONE;
}
@@ -101,7 +100,7 @@ static irqreturn_t rx8010_irq_1_handler(int irq, void *dev_id)
i2c_smbus_write_byte_data(client, RX8010_FLAG, flagreg);
- spin_unlock(&rx8010->flags_lock);
+ mutex_unlock(&rx8010->rtc->ops_lock);
return IRQ_HANDLED;
}
@@ -143,7 +142,6 @@ static int rx8010_set_time(struct device *dev, struct rtc_time *dt)
u8 date[7];
int ctrl, flagreg;
int ret;
- unsigned long irqflags;
if ((dt->tm_year < 100) || (dt->tm_year > 199))
return -EINVAL;
@@ -181,11 +179,8 @@ static int rx8010_set_time(struct device *dev, struct rtc_time *dt)
if (ret < 0)
return ret;
- spin_lock_irqsave(&rx8010->flags_lock, irqflags);
-
flagreg = i2c_smbus_read_byte_data(rx8010->client, RX8010_FLAG);
if (flagreg < 0) {
- spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
return flagreg;
}
@@ -193,8 +188,6 @@ static int rx8010_set_time(struct device *dev, struct rtc_time *dt)
ret = i2c_smbus_write_byte_data(rx8010->client, RX8010_FLAG,
flagreg & ~RX8010_FLAG_VLF);
- spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
-
return 0;
}
@@ -288,12 +281,9 @@ static int rx8010_set_alarm(struct device *dev, struct rtc_wkalrm *t)
u8 alarmvals[3];
int extreg, flagreg;
int err;
- unsigned long irqflags;
- spin_lock_irqsave(&rx8010->flags_lock, irqflags);
flagreg = i2c_smbus_read_byte_data(client, RX8010_FLAG);
if (flagreg < 0) {
- spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
return flagreg;
}
@@ -302,14 +292,12 @@ static int rx8010_set_alarm(struct device *dev, struct rtc_wkalrm *t)
err = i2c_smbus_write_byte_data(rx8010->client, RX8010_CTRL,
rx8010->ctrlreg);
if (err < 0) {
- spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
return err;
}
}
flagreg &= ~RX8010_FLAG_AF;
err = i2c_smbus_write_byte_data(rx8010->client, RX8010_FLAG, flagreg);
- spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
if (err < 0)
return err;
@@ -404,7 +392,6 @@ static int rx8010_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
struct rx8010_data *rx8010 = dev_get_drvdata(dev);
int ret, tmp;
int flagreg;
- unsigned long irqflags;
switch (cmd) {
case RTC_VL_READ:
@@ -419,16 +406,13 @@ static int rx8010_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
return 0;
case RTC_VL_CLR:
- spin_lock_irqsave(&rx8010->flags_lock, irqflags);
flagreg = i2c_smbus_read_byte_data(rx8010->client, RX8010_FLAG);
if (flagreg < 0) {
- spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
return flagreg;
}
flagreg &= ~RX8010_FLAG_VLF;
ret = i2c_smbus_write_byte_data(client, RX8010_FLAG, flagreg);
- spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
if (ret < 0)
return ret;
@@ -466,8 +450,6 @@ static int rx8010_probe(struct i2c_client *client,
rx8010->client = client;
i2c_set_clientdata(client, rx8010);
- spin_lock_init(&rx8010->flags_lock);
-
err = rx8010_init_client(client);
if (err)
return err;
diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c
index 17b6235d67a5..c626e43a9cbb 100644
--- a/drivers/rtc/rtc-sh.c
+++ b/drivers/rtc/rtc-sh.c
@@ -535,7 +535,7 @@ static int sh_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
return 0;
}
-static struct rtc_class_ops sh_rtc_ops = {
+static const struct rtc_class_ops sh_rtc_ops = {
.read_time = sh_rtc_read_time,
.set_time = sh_rtc_set_time,
.read_alarm = sh_rtc_read_alarm,
diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c
index 0f11c2a228e3..d51b07d620f7 100644
--- a/drivers/rtc/rtc-snvs.c
+++ b/drivers/rtc/rtc-snvs.c
@@ -184,6 +184,7 @@ static int snvs_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
rtc_tm_to_time(alrm_tm, &time);
regmap_update_bits(data->regmap, data->offset + SNVS_LPCR, SNVS_LPCR_LPTA_EN, 0);
+ rtc_write_sync_lp(data);
regmap_write(data->regmap, data->offset + SNVS_LPTAR, time);
/* Clear alarm interrupt status bit */
diff --git a/drivers/rtc/rtc-stm32.c b/drivers/rtc/rtc-stm32.c
new file mode 100644
index 000000000000..bd57eb1029e1
--- /dev/null
+++ b/drivers/rtc/rtc-stm32.c
@@ -0,0 +1,725 @@
+/*
+ * Copyright (C) Amelie Delaunay 2016
+ * Author: Amelie Delaunay <amelie.delaunay@st.com>
+ * License terms: GNU General Public License (GPL), version 2
+ */
+
+#include <linux/bcd.h>
+#include <linux/clk.h>
+#include <linux/iopoll.h>
+#include <linux/ioport.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/rtc.h>
+
+#define DRIVER_NAME "stm32_rtc"
+
+/* STM32 RTC registers */
+#define STM32_RTC_TR 0x00
+#define STM32_RTC_DR 0x04
+#define STM32_RTC_CR 0x08
+#define STM32_RTC_ISR 0x0C
+#define STM32_RTC_PRER 0x10
+#define STM32_RTC_ALRMAR 0x1C
+#define STM32_RTC_WPR 0x24
+
+/* STM32_RTC_TR bit fields */
+#define STM32_RTC_TR_SEC_SHIFT 0
+#define STM32_RTC_TR_SEC GENMASK(6, 0)
+#define STM32_RTC_TR_MIN_SHIFT 8
+#define STM32_RTC_TR_MIN GENMASK(14, 8)
+#define STM32_RTC_TR_HOUR_SHIFT 16
+#define STM32_RTC_TR_HOUR GENMASK(21, 16)
+
+/* STM32_RTC_DR bit fields */
+#define STM32_RTC_DR_DATE_SHIFT 0
+#define STM32_RTC_DR_DATE GENMASK(5, 0)
+#define STM32_RTC_DR_MONTH_SHIFT 8
+#define STM32_RTC_DR_MONTH GENMASK(12, 8)
+#define STM32_RTC_DR_WDAY_SHIFT 13
+#define STM32_RTC_DR_WDAY GENMASK(15, 13)
+#define STM32_RTC_DR_YEAR_SHIFT 16
+#define STM32_RTC_DR_YEAR GENMASK(23, 16)
+
+/* STM32_RTC_CR bit fields */
+#define STM32_RTC_CR_FMT BIT(6)
+#define STM32_RTC_CR_ALRAE BIT(8)
+#define STM32_RTC_CR_ALRAIE BIT(12)
+
+/* STM32_RTC_ISR bit fields */
+#define STM32_RTC_ISR_ALRAWF BIT(0)
+#define STM32_RTC_ISR_INITS BIT(4)
+#define STM32_RTC_ISR_RSF BIT(5)
+#define STM32_RTC_ISR_INITF BIT(6)
+#define STM32_RTC_ISR_INIT BIT(7)
+#define STM32_RTC_ISR_ALRAF BIT(8)
+
+/* STM32_RTC_PRER bit fields */
+#define STM32_RTC_PRER_PRED_S_SHIFT 0
+#define STM32_RTC_PRER_PRED_S GENMASK(14, 0)
+#define STM32_RTC_PRER_PRED_A_SHIFT 16
+#define STM32_RTC_PRER_PRED_A GENMASK(22, 16)
+
+/* STM32_RTC_ALRMAR and STM32_RTC_ALRMBR bit fields */
+#define STM32_RTC_ALRMXR_SEC_SHIFT 0
+#define STM32_RTC_ALRMXR_SEC GENMASK(6, 0)
+#define STM32_RTC_ALRMXR_SEC_MASK BIT(7)
+#define STM32_RTC_ALRMXR_MIN_SHIFT 8
+#define STM32_RTC_ALRMXR_MIN GENMASK(14, 8)
+#define STM32_RTC_ALRMXR_MIN_MASK BIT(15)
+#define STM32_RTC_ALRMXR_HOUR_SHIFT 16
+#define STM32_RTC_ALRMXR_HOUR GENMASK(21, 16)
+#define STM32_RTC_ALRMXR_PM BIT(22)
+#define STM32_RTC_ALRMXR_HOUR_MASK BIT(23)
+#define STM32_RTC_ALRMXR_DATE_SHIFT 24
+#define STM32_RTC_ALRMXR_DATE GENMASK(29, 24)
+#define STM32_RTC_ALRMXR_WDSEL BIT(30)
+#define STM32_RTC_ALRMXR_WDAY_SHIFT 24
+#define STM32_RTC_ALRMXR_WDAY GENMASK(27, 24)
+#define STM32_RTC_ALRMXR_DATE_MASK BIT(31)
+
+/* STM32_RTC_WPR key constants */
+#define RTC_WPR_1ST_KEY 0xCA
+#define RTC_WPR_2ND_KEY 0x53
+#define RTC_WPR_WRONG_KEY 0xFF
+
+/*
+ * RTC registers are protected against parasitic write access.
+ * PWR_CR_DBP bit must be set to enable write access to RTC registers.
+ */
+/* STM32_PWR_CR */
+#define PWR_CR 0x00
+/* STM32_PWR_CR bit field */
+#define PWR_CR_DBP BIT(8)
+
+struct stm32_rtc {
+ struct rtc_device *rtc_dev;
+ void __iomem *base;
+ struct regmap *dbp;
+ struct clk *ck_rtc;
+ int irq_alarm;
+};
+
+static void stm32_rtc_wpr_unlock(struct stm32_rtc *rtc)
+{
+ writel_relaxed(RTC_WPR_1ST_KEY, rtc->base + STM32_RTC_WPR);
+ writel_relaxed(RTC_WPR_2ND_KEY, rtc->base + STM32_RTC_WPR);
+}
+
+static void stm32_rtc_wpr_lock(struct stm32_rtc *rtc)
+{
+ writel_relaxed(RTC_WPR_WRONG_KEY, rtc->base + STM32_RTC_WPR);
+}
+
+static int stm32_rtc_enter_init_mode(struct stm32_rtc *rtc)
+{
+ unsigned int isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+ if (!(isr & STM32_RTC_ISR_INITF)) {
+ isr |= STM32_RTC_ISR_INIT;
+ writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+
+ /*
+ * It takes around 2 ck_rtc clock cycles to enter in
+ * initialization phase mode (and have INITF flag set). As
+ * slowest ck_rtc frequency may be 32kHz and highest should be
+ * 1MHz, we poll every 10 us with a timeout of 100ms.
+ */
+ return readl_relaxed_poll_timeout_atomic(
+ rtc->base + STM32_RTC_ISR,
+ isr, (isr & STM32_RTC_ISR_INITF),
+ 10, 100000);
+ }
+
+ return 0;
+}
+
+static void stm32_rtc_exit_init_mode(struct stm32_rtc *rtc)
+{
+ unsigned int isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+ isr &= ~STM32_RTC_ISR_INIT;
+ writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+}
+
+static int stm32_rtc_wait_sync(struct stm32_rtc *rtc)
+{
+ unsigned int isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+ isr &= ~STM32_RTC_ISR_RSF;
+ writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+
+ /*
+ * Wait for RSF to be set to ensure the calendar registers are
+ * synchronised, it takes around 2 ck_rtc clock cycles
+ */
+ return readl_relaxed_poll_timeout_atomic(rtc->base + STM32_RTC_ISR,
+ isr,
+ (isr & STM32_RTC_ISR_RSF),
+ 10, 100000);
+}
+
+static irqreturn_t stm32_rtc_alarm_irq(int irq, void *dev_id)
+{
+ struct stm32_rtc *rtc = (struct stm32_rtc *)dev_id;
+ unsigned int isr, cr;
+
+ mutex_lock(&rtc->rtc_dev->ops_lock);
+
+ isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+ cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+
+ if ((isr & STM32_RTC_ISR_ALRAF) &&
+ (cr & STM32_RTC_CR_ALRAIE)) {
+ /* Alarm A flag - Alarm interrupt */
+ dev_dbg(&rtc->rtc_dev->dev, "Alarm occurred\n");
+
+ /* Pass event to the kernel */
+ rtc_update_irq(rtc->rtc_dev, 1, RTC_IRQF | RTC_AF);
+
+ /* Clear event flag, otherwise new events won't be received */
+ writel_relaxed(isr & ~STM32_RTC_ISR_ALRAF,
+ rtc->base + STM32_RTC_ISR);
+ }
+
+ mutex_unlock(&rtc->rtc_dev->ops_lock);
+
+ return IRQ_HANDLED;
+}
+
+/* Convert rtc_time structure from bin to bcd format */
+static void tm2bcd(struct rtc_time *tm)
+{
+ tm->tm_sec = bin2bcd(tm->tm_sec);
+ tm->tm_min = bin2bcd(tm->tm_min);
+ tm->tm_hour = bin2bcd(tm->tm_hour);
+
+ tm->tm_mday = bin2bcd(tm->tm_mday);
+ tm->tm_mon = bin2bcd(tm->tm_mon + 1);
+ tm->tm_year = bin2bcd(tm->tm_year - 100);
+ /*
+ * Number of days since Sunday
+ * - on kernel side, 0=Sunday...6=Saturday
+ * - on rtc side, 0=invalid,1=Monday...7=Sunday
+ */
+ tm->tm_wday = (!tm->tm_wday) ? 7 : tm->tm_wday;
+}
+
+/* Convert rtc_time structure from bcd to bin format */
+static void bcd2tm(struct rtc_time *tm)
+{
+ tm->tm_sec = bcd2bin(tm->tm_sec);
+ tm->tm_min = bcd2bin(tm->tm_min);
+ tm->tm_hour = bcd2bin(tm->tm_hour);
+
+ tm->tm_mday = bcd2bin(tm->tm_mday);
+ tm->tm_mon = bcd2bin(tm->tm_mon) - 1;
+ tm->tm_year = bcd2bin(tm->tm_year) + 100;
+ /*
+ * Number of days since Sunday
+ * - on kernel side, 0=Sunday...6=Saturday
+ * - on rtc side, 0=invalid,1=Monday...7=Sunday
+ */
+ tm->tm_wday %= 7;
+}
+
+static int stm32_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+ struct stm32_rtc *rtc = dev_get_drvdata(dev);
+ unsigned int tr, dr;
+
+ /* Time and Date in BCD format */
+ tr = readl_relaxed(rtc->base + STM32_RTC_TR);
+ dr = readl_relaxed(rtc->base + STM32_RTC_DR);
+
+ tm->tm_sec = (tr & STM32_RTC_TR_SEC) >> STM32_RTC_TR_SEC_SHIFT;
+ tm->tm_min = (tr & STM32_RTC_TR_MIN) >> STM32_RTC_TR_MIN_SHIFT;
+ tm->tm_hour = (tr & STM32_RTC_TR_HOUR) >> STM32_RTC_TR_HOUR_SHIFT;
+
+ tm->tm_mday = (dr & STM32_RTC_DR_DATE) >> STM32_RTC_DR_DATE_SHIFT;
+ tm->tm_mon = (dr & STM32_RTC_DR_MONTH) >> STM32_RTC_DR_MONTH_SHIFT;
+ tm->tm_year = (dr & STM32_RTC_DR_YEAR) >> STM32_RTC_DR_YEAR_SHIFT;
+ tm->tm_wday = (dr & STM32_RTC_DR_WDAY) >> STM32_RTC_DR_WDAY_SHIFT;
+
+ /* We don't report tm_yday and tm_isdst */
+
+ bcd2tm(tm);
+
+ return 0;
+}
+
+static int stm32_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+ struct stm32_rtc *rtc = dev_get_drvdata(dev);
+ unsigned int tr, dr;
+ int ret = 0;
+
+ tm2bcd(tm);
+
+ /* Time in BCD format */
+ tr = ((tm->tm_sec << STM32_RTC_TR_SEC_SHIFT) & STM32_RTC_TR_SEC) |
+ ((tm->tm_min << STM32_RTC_TR_MIN_SHIFT) & STM32_RTC_TR_MIN) |
+ ((tm->tm_hour << STM32_RTC_TR_HOUR_SHIFT) & STM32_RTC_TR_HOUR);
+
+ /* Date in BCD format */
+ dr = ((tm->tm_mday << STM32_RTC_DR_DATE_SHIFT) & STM32_RTC_DR_DATE) |
+ ((tm->tm_mon << STM32_RTC_DR_MONTH_SHIFT) & STM32_RTC_DR_MONTH) |
+ ((tm->tm_year << STM32_RTC_DR_YEAR_SHIFT) & STM32_RTC_DR_YEAR) |
+ ((tm->tm_wday << STM32_RTC_DR_WDAY_SHIFT) & STM32_RTC_DR_WDAY);
+
+ stm32_rtc_wpr_unlock(rtc);
+
+ ret = stm32_rtc_enter_init_mode(rtc);
+ if (ret) {
+ dev_err(dev, "Can't enter in init mode. Set time aborted.\n");
+ goto end;
+ }
+
+ writel_relaxed(tr, rtc->base + STM32_RTC_TR);
+ writel_relaxed(dr, rtc->base + STM32_RTC_DR);
+
+ stm32_rtc_exit_init_mode(rtc);
+
+ ret = stm32_rtc_wait_sync(rtc);
+end:
+ stm32_rtc_wpr_lock(rtc);
+
+ return ret;
+}
+
+static int stm32_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+ struct stm32_rtc *rtc = dev_get_drvdata(dev);
+ struct rtc_time *tm = &alrm->time;
+ unsigned int alrmar, cr, isr;
+
+ alrmar = readl_relaxed(rtc->base + STM32_RTC_ALRMAR);
+ cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+ isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+ if (alrmar & STM32_RTC_ALRMXR_DATE_MASK) {
+ /*
+ * Date/day doesn't matter in Alarm comparison so alarm
+ * triggers every day
+ */
+ tm->tm_mday = -1;
+ tm->tm_wday = -1;
+ } else {
+ if (alrmar & STM32_RTC_ALRMXR_WDSEL) {
+ /* Alarm is set to a day of week */
+ tm->tm_mday = -1;
+ tm->tm_wday = (alrmar & STM32_RTC_ALRMXR_WDAY) >>
+ STM32_RTC_ALRMXR_WDAY_SHIFT;
+ tm->tm_wday %= 7;
+ } else {
+ /* Alarm is set to a day of month */
+ tm->tm_wday = -1;
+ tm->tm_mday = (alrmar & STM32_RTC_ALRMXR_DATE) >>
+ STM32_RTC_ALRMXR_DATE_SHIFT;
+ }
+ }
+
+ if (alrmar & STM32_RTC_ALRMXR_HOUR_MASK) {
+ /* Hours don't matter in Alarm comparison */
+ tm->tm_hour = -1;
+ } else {
+ tm->tm_hour = (alrmar & STM32_RTC_ALRMXR_HOUR) >>
+ STM32_RTC_ALRMXR_HOUR_SHIFT;
+ if (alrmar & STM32_RTC_ALRMXR_PM)
+ tm->tm_hour += 12;
+ }
+
+ if (alrmar & STM32_RTC_ALRMXR_MIN_MASK) {
+ /* Minutes don't matter in Alarm comparison */
+ tm->tm_min = -1;
+ } else {
+ tm->tm_min = (alrmar & STM32_RTC_ALRMXR_MIN) >>
+ STM32_RTC_ALRMXR_MIN_SHIFT;
+ }
+
+ if (alrmar & STM32_RTC_ALRMXR_SEC_MASK) {
+ /* Seconds don't matter in Alarm comparison */
+ tm->tm_sec = -1;
+ } else {
+ tm->tm_sec = (alrmar & STM32_RTC_ALRMXR_SEC) >>
+ STM32_RTC_ALRMXR_SEC_SHIFT;
+ }
+
+ bcd2tm(tm);
+
+ alrm->enabled = (cr & STM32_RTC_CR_ALRAE) ? 1 : 0;
+ alrm->pending = (isr & STM32_RTC_ISR_ALRAF) ? 1 : 0;
+
+ return 0;
+}
+
+static int stm32_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
+{
+ struct stm32_rtc *rtc = dev_get_drvdata(dev);
+ unsigned int isr, cr;
+
+ cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+
+ stm32_rtc_wpr_unlock(rtc);
+
+ /* We expose Alarm A to the kernel */
+ if (enabled)
+ cr |= (STM32_RTC_CR_ALRAIE | STM32_RTC_CR_ALRAE);
+ else
+ cr &= ~(STM32_RTC_CR_ALRAIE | STM32_RTC_CR_ALRAE);
+ writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+
+ /* Clear event flag, otherwise new events won't be received */
+ isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+ isr &= ~STM32_RTC_ISR_ALRAF;
+ writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+
+ stm32_rtc_wpr_lock(rtc);
+
+ return 0;
+}
+
+static int stm32_rtc_valid_alrm(struct stm32_rtc *rtc, struct rtc_time *tm)
+{
+ int cur_day, cur_mon, cur_year, cur_hour, cur_min, cur_sec;
+ unsigned int dr = readl_relaxed(rtc->base + STM32_RTC_DR);
+ unsigned int tr = readl_relaxed(rtc->base + STM32_RTC_TR);
+
+ cur_day = (dr & STM32_RTC_DR_DATE) >> STM32_RTC_DR_DATE_SHIFT;
+ cur_mon = (dr & STM32_RTC_DR_MONTH) >> STM32_RTC_DR_MONTH_SHIFT;
+ cur_year = (dr & STM32_RTC_DR_YEAR) >> STM32_RTC_DR_YEAR_SHIFT;
+ cur_sec = (tr & STM32_RTC_TR_SEC) >> STM32_RTC_TR_SEC_SHIFT;
+ cur_min = (tr & STM32_RTC_TR_MIN) >> STM32_RTC_TR_MIN_SHIFT;
+ cur_hour = (tr & STM32_RTC_TR_HOUR) >> STM32_RTC_TR_HOUR_SHIFT;
+
+ /*
+ * Assuming current date is M-D-Y H:M:S.
+ * RTC alarm can't be set on a specific month and year.
+ * So the valid alarm range is:
+ * M-D-Y H:M:S < alarm <= (M+1)-D-Y H:M:S
+ * with a specific case for December...
+ */
+ if ((((tm->tm_year > cur_year) &&
+ (tm->tm_mon == 0x1) && (cur_mon == 0x12)) ||
+ ((tm->tm_year == cur_year) &&
+ (tm->tm_mon <= cur_mon + 1))) &&
+ ((tm->tm_mday > cur_day) ||
+ ((tm->tm_mday == cur_day) &&
+ ((tm->tm_hour > cur_hour) ||
+ ((tm->tm_hour == cur_hour) && (tm->tm_min > cur_min)) ||
+ ((tm->tm_hour == cur_hour) && (tm->tm_min == cur_min) &&
+ (tm->tm_sec >= cur_sec))))))
+ return 0;
+
+ return -EINVAL;
+}
+
+static int stm32_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+ struct stm32_rtc *rtc = dev_get_drvdata(dev);
+ struct rtc_time *tm = &alrm->time;
+ unsigned int cr, isr, alrmar;
+ int ret = 0;
+
+ tm2bcd(tm);
+
+ /*
+ * RTC alarm can't be set on a specific date, unless this date is
+ * up to the same day of month next month.
+ */
+ if (stm32_rtc_valid_alrm(rtc, tm) < 0) {
+ dev_err(dev, "Alarm can be set only on upcoming month.\n");
+ return -EINVAL;
+ }
+
+ alrmar = 0;
+ /* tm_year and tm_mon are not used because not supported by RTC */
+ alrmar |= (tm->tm_mday << STM32_RTC_ALRMXR_DATE_SHIFT) &
+ STM32_RTC_ALRMXR_DATE;
+ /* 24-hour format */
+ alrmar &= ~STM32_RTC_ALRMXR_PM;
+ alrmar |= (tm->tm_hour << STM32_RTC_ALRMXR_HOUR_SHIFT) &
+ STM32_RTC_ALRMXR_HOUR;
+ alrmar |= (tm->tm_min << STM32_RTC_ALRMXR_MIN_SHIFT) &
+ STM32_RTC_ALRMXR_MIN;
+ alrmar |= (tm->tm_sec << STM32_RTC_ALRMXR_SEC_SHIFT) &
+ STM32_RTC_ALRMXR_SEC;
+
+ stm32_rtc_wpr_unlock(rtc);
+
+ /* Disable Alarm */
+ cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+ cr &= ~STM32_RTC_CR_ALRAE;
+ writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+
+ /*
+ * Poll Alarm write flag to be sure that Alarm update is allowed: it
+ * takes around 2 ck_rtc clock cycles
+ */
+ ret = readl_relaxed_poll_timeout_atomic(rtc->base + STM32_RTC_ISR,
+ isr,
+ (isr & STM32_RTC_ISR_ALRAWF),
+ 10, 100000);
+
+ if (ret) {
+ dev_err(dev, "Alarm update not allowed\n");
+ goto end;
+ }
+
+ /* Write to Alarm register */
+ writel_relaxed(alrmar, rtc->base + STM32_RTC_ALRMAR);
+
+ if (alrm->enabled)
+ stm32_rtc_alarm_irq_enable(dev, 1);
+ else
+ stm32_rtc_alarm_irq_enable(dev, 0);
+
+end:
+ stm32_rtc_wpr_lock(rtc);
+
+ return ret;
+}
+
+static const struct rtc_class_ops stm32_rtc_ops = {
+ .read_time = stm32_rtc_read_time,
+ .set_time = stm32_rtc_set_time,
+ .read_alarm = stm32_rtc_read_alarm,
+ .set_alarm = stm32_rtc_set_alarm,
+ .alarm_irq_enable = stm32_rtc_alarm_irq_enable,
+};
+
+static const struct of_device_id stm32_rtc_of_match[] = {
+ { .compatible = "st,stm32-rtc" },
+ {}
+};
+MODULE_DEVICE_TABLE(of, stm32_rtc_of_match);
+
+static int stm32_rtc_init(struct platform_device *pdev,
+ struct stm32_rtc *rtc)
+{
+ unsigned int prer, pred_a, pred_s, pred_a_max, pred_s_max, cr;
+ unsigned int rate;
+ int ret = 0;
+
+ rate = clk_get_rate(rtc->ck_rtc);
+
+ /* Find prediv_a and prediv_s to obtain the 1Hz calendar clock */
+ pred_a_max = STM32_RTC_PRER_PRED_A >> STM32_RTC_PRER_PRED_A_SHIFT;
+ pred_s_max = STM32_RTC_PRER_PRED_S >> STM32_RTC_PRER_PRED_S_SHIFT;
+
+ for (pred_a = pred_a_max; pred_a + 1 > 0; pred_a--) {
+ pred_s = (rate / (pred_a + 1)) - 1;
+
+ if (((pred_s + 1) * (pred_a + 1)) == rate)
+ break;
+ }
+
+ /*
+ * Can't find a 1Hz, so give priority to RTC power consumption
+ * by choosing the higher possible value for prediv_a
+ */
+ if ((pred_s > pred_s_max) || (pred_a > pred_a_max)) {
+ pred_a = pred_a_max;
+ pred_s = (rate / (pred_a + 1)) - 1;
+
+ dev_warn(&pdev->dev, "ck_rtc is %s\n",
+ (rate < ((pred_a + 1) * (pred_s + 1))) ?
+ "fast" : "slow");
+ }
+
+ stm32_rtc_wpr_unlock(rtc);
+
+ ret = stm32_rtc_enter_init_mode(rtc);
+ if (ret) {
+ dev_err(&pdev->dev,
+ "Can't enter in init mode. Prescaler config failed.\n");
+ goto end;
+ }
+
+ prer = (pred_s << STM32_RTC_PRER_PRED_S_SHIFT) & STM32_RTC_PRER_PRED_S;
+ writel_relaxed(prer, rtc->base + STM32_RTC_PRER);
+ prer |= (pred_a << STM32_RTC_PRER_PRED_A_SHIFT) & STM32_RTC_PRER_PRED_A;
+ writel_relaxed(prer, rtc->base + STM32_RTC_PRER);
+
+ /* Force 24h time format */
+ cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+ cr &= ~STM32_RTC_CR_FMT;
+ writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+
+ stm32_rtc_exit_init_mode(rtc);
+
+ ret = stm32_rtc_wait_sync(rtc);
+end:
+ stm32_rtc_wpr_lock(rtc);
+
+ return ret;
+}
+
+static int stm32_rtc_probe(struct platform_device *pdev)
+{
+ struct stm32_rtc *rtc;
+ struct resource *res;
+ int ret;
+
+ rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL);
+ if (!rtc)
+ return -ENOMEM;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ rtc->base = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(rtc->base))
+ return PTR_ERR(rtc->base);
+
+ rtc->dbp = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+ "st,syscfg");
+ if (IS_ERR(rtc->dbp)) {
+ dev_err(&pdev->dev, "no st,syscfg\n");
+ return PTR_ERR(rtc->dbp);
+ }
+
+ rtc->ck_rtc = devm_clk_get(&pdev->dev, NULL);
+ if (IS_ERR(rtc->ck_rtc)) {
+ dev_err(&pdev->dev, "no ck_rtc clock");
+ return PTR_ERR(rtc->ck_rtc);
+ }
+
+ ret = clk_prepare_enable(rtc->ck_rtc);
+ if (ret)
+ return ret;
+
+ regmap_update_bits(rtc->dbp, PWR_CR, PWR_CR_DBP, PWR_CR_DBP);
+
+ /*
+ * After a system reset, RTC_ISR.INITS flag can be read to check if
+ * the calendar has been initalized or not. INITS flag is reset by a
+ * power-on reset (no vbat, no power-supply). It is not reset if
+ * ck_rtc parent clock has changed (so RTC prescalers need to be
+ * changed). That's why we cannot rely on this flag to know if RTC
+ * init has to be done.
+ */
+ ret = stm32_rtc_init(pdev, rtc);
+ if (ret)
+ goto err;
+
+ rtc->irq_alarm = platform_get_irq(pdev, 0);
+ if (rtc->irq_alarm <= 0) {
+ dev_err(&pdev->dev, "no alarm irq\n");
+ ret = rtc->irq_alarm;
+ goto err;
+ }
+
+ platform_set_drvdata(pdev, rtc);
+
+ ret = device_init_wakeup(&pdev->dev, true);
+ if (ret)
+ dev_warn(&pdev->dev,
+ "alarm won't be able to wake up the system");
+
+ rtc->rtc_dev = devm_rtc_device_register(&pdev->dev, pdev->name,
+ &stm32_rtc_ops, THIS_MODULE);
+ if (IS_ERR(rtc->rtc_dev)) {
+ ret = PTR_ERR(rtc->rtc_dev);
+ dev_err(&pdev->dev, "rtc device registration failed, err=%d\n",
+ ret);
+ goto err;
+ }
+
+ /* Handle RTC alarm interrupts */
+ ret = devm_request_threaded_irq(&pdev->dev, rtc->irq_alarm, NULL,
+ stm32_rtc_alarm_irq,
+ IRQF_TRIGGER_RISING | IRQF_ONESHOT,
+ pdev->name, rtc);
+ if (ret) {
+ dev_err(&pdev->dev, "IRQ%d (alarm interrupt) already claimed\n",
+ rtc->irq_alarm);
+ goto err;
+ }
+
+ /*
+ * If INITS flag is reset (calendar year field set to 0x00), calendar
+ * must be initialized
+ */
+ if (!(readl_relaxed(rtc->base + STM32_RTC_ISR) & STM32_RTC_ISR_INITS))
+ dev_warn(&pdev->dev, "Date/Time must be initialized\n");
+
+ return 0;
+err:
+ clk_disable_unprepare(rtc->ck_rtc);
+
+ regmap_update_bits(rtc->dbp, PWR_CR, PWR_CR_DBP, 0);
+
+ device_init_wakeup(&pdev->dev, false);
+
+ return ret;
+}
+
+static int stm32_rtc_remove(struct platform_device *pdev)
+{
+ struct stm32_rtc *rtc = platform_get_drvdata(pdev);
+ unsigned int cr;
+
+ /* Disable interrupts */
+ stm32_rtc_wpr_unlock(rtc);
+ cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+ cr &= ~STM32_RTC_CR_ALRAIE;
+ writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+ stm32_rtc_wpr_lock(rtc);
+
+ clk_disable_unprepare(rtc->ck_rtc);
+
+ /* Enable backup domain write protection */
+ regmap_update_bits(rtc->dbp, PWR_CR, PWR_CR_DBP, 0);
+
+ device_init_wakeup(&pdev->dev, false);
+
+ return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int stm32_rtc_suspend(struct device *dev)
+{
+ struct stm32_rtc *rtc = dev_get_drvdata(dev);
+
+ if (device_may_wakeup(dev))
+ return enable_irq_wake(rtc->irq_alarm);
+
+ return 0;
+}
+
+static int stm32_rtc_resume(struct device *dev)
+{
+ struct stm32_rtc *rtc = dev_get_drvdata(dev);
+ int ret = 0;
+
+ ret = stm32_rtc_wait_sync(rtc);
+ if (ret < 0)
+ return ret;
+
+ if (device_may_wakeup(dev))
+ return disable_irq_wake(rtc->irq_alarm);
+
+ return ret;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(stm32_rtc_pm_ops,
+ stm32_rtc_suspend, stm32_rtc_resume);
+
+static struct platform_driver stm32_rtc_driver = {
+ .probe = stm32_rtc_probe,
+ .remove = stm32_rtc_remove,
+ .driver = {
+ .name = DRIVER_NAME,
+ .pm = &stm32_rtc_pm_ops,
+ .of_match_table = stm32_rtc_of_match,
+ },
+};
+
+module_platform_driver(stm32_rtc_driver);
+
+MODULE_ALIAS("platform:" DRIVER_NAME);
+MODULE_AUTHOR("Amelie Delaunay <amelie.delaunay@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics STM32 Real Time Clock driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c
index c169a2cd4727..39cbc1238b92 100644
--- a/drivers/rtc/rtc-sun6i.c
+++ b/drivers/rtc/rtc-sun6i.c
@@ -20,6 +20,8 @@
* more details.
*/
+#include <linux/clk.h>
+#include <linux/clk-provider.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/fs.h>
@@ -33,15 +35,20 @@
#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/rtc.h>
+#include <linux/slab.h>
#include <linux/types.h>
/* Control register */
#define SUN6I_LOSC_CTRL 0x0000
+#define SUN6I_LOSC_CTRL_KEY (0x16aa << 16)
#define SUN6I_LOSC_CTRL_ALM_DHMS_ACC BIT(9)
#define SUN6I_LOSC_CTRL_RTC_HMS_ACC BIT(8)
#define SUN6I_LOSC_CTRL_RTC_YMD_ACC BIT(7)
+#define SUN6I_LOSC_CTRL_EXT_OSC BIT(0)
#define SUN6I_LOSC_CTRL_ACC_MASK GENMASK(9, 7)
+#define SUN6I_LOSC_CLK_PRESCAL 0x0008
+
/* RTC */
#define SUN6I_RTC_YMD 0x0010
#define SUN6I_RTC_HMS 0x0014
@@ -114,13 +121,142 @@ struct sun6i_rtc_dev {
void __iomem *base;
int irq;
unsigned long alarm;
+
+ struct clk_hw hw;
+ struct clk_hw *int_osc;
+ struct clk *losc;
+
+ spinlock_t lock;
+};
+
+static struct sun6i_rtc_dev *sun6i_rtc;
+
+static unsigned long sun6i_rtc_osc_recalc_rate(struct clk_hw *hw,
+ unsigned long parent_rate)
+{
+ struct sun6i_rtc_dev *rtc = container_of(hw, struct sun6i_rtc_dev, hw);
+ u32 val;
+
+ val = readl(rtc->base + SUN6I_LOSC_CTRL);
+ if (val & SUN6I_LOSC_CTRL_EXT_OSC)
+ return parent_rate;
+
+ val = readl(rtc->base + SUN6I_LOSC_CLK_PRESCAL);
+ val &= GENMASK(4, 0);
+
+ return parent_rate / (val + 1);
+}
+
+static u8 sun6i_rtc_osc_get_parent(struct clk_hw *hw)
+{
+ struct sun6i_rtc_dev *rtc = container_of(hw, struct sun6i_rtc_dev, hw);
+
+ return readl(rtc->base + SUN6I_LOSC_CTRL) & SUN6I_LOSC_CTRL_EXT_OSC;
+}
+
+static int sun6i_rtc_osc_set_parent(struct clk_hw *hw, u8 index)
+{
+ struct sun6i_rtc_dev *rtc = container_of(hw, struct sun6i_rtc_dev, hw);
+ unsigned long flags;
+ u32 val;
+
+ if (index > 1)
+ return -EINVAL;
+
+ spin_lock_irqsave(&rtc->lock, flags);
+ val = readl(rtc->base + SUN6I_LOSC_CTRL);
+ val &= ~SUN6I_LOSC_CTRL_EXT_OSC;
+ val |= SUN6I_LOSC_CTRL_KEY;
+ val |= index ? SUN6I_LOSC_CTRL_EXT_OSC : 0;
+ writel(val, rtc->base + SUN6I_LOSC_CTRL);
+ spin_unlock_irqrestore(&rtc->lock, flags);
+
+ return 0;
+}
+
+static const struct clk_ops sun6i_rtc_osc_ops = {
+ .recalc_rate = sun6i_rtc_osc_recalc_rate,
+
+ .get_parent = sun6i_rtc_osc_get_parent,
+ .set_parent = sun6i_rtc_osc_set_parent,
};
+static void __init sun6i_rtc_clk_init(struct device_node *node)
+{
+ struct clk_hw_onecell_data *clk_data;
+ struct sun6i_rtc_dev *rtc;
+ struct clk_init_data init = {
+ .ops = &sun6i_rtc_osc_ops,
+ };
+ const char *parents[2];
+
+ rtc = kzalloc(sizeof(*rtc), GFP_KERNEL);
+ if (!rtc)
+ return;
+ spin_lock_init(&rtc->lock);
+
+ clk_data = kzalloc(sizeof(*clk_data) + sizeof(*clk_data->hws),
+ GFP_KERNEL);
+ if (!clk_data)
+ return;
+ spin_lock_init(&rtc->lock);
+
+ rtc->base = of_io_request_and_map(node, 0, of_node_full_name(node));
+ if (IS_ERR(rtc->base)) {
+ pr_crit("Can't map RTC registers");
+ return;
+ }
+
+ /* Switch to the external, more precise, oscillator */
+ writel(SUN6I_LOSC_CTRL_KEY | SUN6I_LOSC_CTRL_EXT_OSC,
+ rtc->base + SUN6I_LOSC_CTRL);
+
+ /* Yes, I know, this is ugly. */
+ sun6i_rtc = rtc;
+
+ /* Deal with old DTs */
+ if (!of_get_property(node, "clocks", NULL))
+ return;
+
+ rtc->int_osc = clk_hw_register_fixed_rate_with_accuracy(NULL,
+ "rtc-int-osc",
+ NULL, 0,
+ 667000,
+ 300000000);
+ if (IS_ERR(rtc->int_osc)) {
+ pr_crit("Couldn't register the internal oscillator\n");
+ return;
+ }
+
+ parents[0] = clk_hw_get_name(rtc->int_osc);
+ parents[1] = of_clk_get_parent_name(node, 0);
+
+ rtc->hw.init = &init;
+
+ init.parent_names = parents;
+ init.num_parents = of_clk_get_parent_count(node) + 1;
+ of_property_read_string(node, "clock-output-names", &init.name);
+
+ rtc->losc = clk_register(NULL, &rtc->hw);
+ if (IS_ERR(rtc->losc)) {
+ pr_crit("Couldn't register the LOSC clock\n");
+ return;
+ }
+
+ clk_data->num = 1;
+ clk_data->hws[0] = &rtc->hw;
+ of_clk_add_hw_provider(node, of_clk_hw_onecell_get, clk_data);
+}
+CLK_OF_DECLARE_DRIVER(sun6i_rtc_clk, "allwinner,sun6i-a31-rtc",
+ sun6i_rtc_clk_init);
+
static irqreturn_t sun6i_rtc_alarmirq(int irq, void *id)
{
struct sun6i_rtc_dev *chip = (struct sun6i_rtc_dev *) id;
+ irqreturn_t ret = IRQ_NONE;
u32 val;
+ spin_lock(&chip->lock);
val = readl(chip->base + SUN6I_ALRM_IRQ_STA);
if (val & SUN6I_ALRM_IRQ_STA_CNT_IRQ_PEND) {
@@ -129,10 +265,11 @@ static irqreturn_t sun6i_rtc_alarmirq(int irq, void *id)
rtc_update_irq(chip->rtc, 1, RTC_AF | RTC_IRQF);
- return IRQ_HANDLED;
+ ret = IRQ_HANDLED;
}
+ spin_unlock(&chip->lock);
- return IRQ_NONE;
+ return ret;
}
static void sun6i_rtc_setaie(int to, struct sun6i_rtc_dev *chip)
@@ -140,6 +277,7 @@ static void sun6i_rtc_setaie(int to, struct sun6i_rtc_dev *chip)
u32 alrm_val = 0;
u32 alrm_irq_val = 0;
u32 alrm_wake_val = 0;
+ unsigned long flags;
if (to) {
alrm_val = SUN6I_ALRM_EN_CNT_EN;
@@ -150,9 +288,11 @@ static void sun6i_rtc_setaie(int to, struct sun6i_rtc_dev *chip)
chip->base + SUN6I_ALRM_IRQ_STA);
}
+ spin_lock_irqsave(&chip->lock, flags);
writel(alrm_val, chip->base + SUN6I_ALRM_EN);
writel(alrm_irq_val, chip->base + SUN6I_ALRM_IRQ_EN);
writel(alrm_wake_val, chip->base + SUN6I_ALARM_CONFIG);
+ spin_unlock_irqrestore(&chip->lock, flags);
}
static int sun6i_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
@@ -191,11 +331,15 @@ static int sun6i_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
static int sun6i_rtc_getalarm(struct device *dev, struct rtc_wkalrm *wkalrm)
{
struct sun6i_rtc_dev *chip = dev_get_drvdata(dev);
+ unsigned long flags;
u32 alrm_st;
u32 alrm_en;
+ spin_lock_irqsave(&chip->lock, flags);
alrm_en = readl(chip->base + SUN6I_ALRM_IRQ_EN);
alrm_st = readl(chip->base + SUN6I_ALRM_IRQ_STA);
+ spin_unlock_irqrestore(&chip->lock, flags);
+
wkalrm->enabled = !!(alrm_en & SUN6I_ALRM_EN_CNT_EN);
wkalrm->pending = !!(alrm_st & SUN6I_ALRM_EN_CNT_EN);
rtc_time_to_tm(chip->alarm, &wkalrm->time);
@@ -349,22 +493,15 @@ static const struct rtc_class_ops sun6i_rtc_ops = {
static int sun6i_rtc_probe(struct platform_device *pdev)
{
- struct sun6i_rtc_dev *chip;
- struct resource *res;
+ struct sun6i_rtc_dev *chip = sun6i_rtc;
int ret;
- chip = devm_kzalloc(&pdev->dev, sizeof(*chip), GFP_KERNEL);
if (!chip)
- return -ENOMEM;
+ return -ENODEV;
platform_set_drvdata(pdev, chip);
chip->dev = &pdev->dev;
- res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- chip->base = devm_ioremap_resource(&pdev->dev, res);
- if (IS_ERR(chip->base))
- return PTR_ERR(chip->base);
-
chip->irq = platform_get_irq(pdev, 0);
if (chip->irq < 0) {
dev_err(&pdev->dev, "No IRQ resource\n");
@@ -404,8 +541,10 @@ static int sun6i_rtc_probe(struct platform_device *pdev)
/* disable alarm wakeup */
writel(0, chip->base + SUN6I_ALARM_CONFIG);
- chip->rtc = rtc_device_register("rtc-sun6i", &pdev->dev,
- &sun6i_rtc_ops, THIS_MODULE);
+ clk_prepare_enable(chip->losc);
+
+ chip->rtc = devm_rtc_device_register(&pdev->dev, "rtc-sun6i",
+ &sun6i_rtc_ops, THIS_MODULE);
if (IS_ERR(chip->rtc)) {
dev_err(&pdev->dev, "unable to register device\n");
return PTR_ERR(chip->rtc);
@@ -416,15 +555,6 @@ static int sun6i_rtc_probe(struct platform_device *pdev)
return 0;
}
-static int sun6i_rtc_remove(struct platform_device *pdev)
-{
- struct sun6i_rtc_dev *chip = platform_get_drvdata(pdev);
-
- rtc_device_unregister(chip->rtc);
-
- return 0;
-}
-
static const struct of_device_id sun6i_rtc_dt_ids[] = {
{ .compatible = "allwinner,sun6i-a31-rtc" },
{ /* sentinel */ },
@@ -433,15 +563,9 @@ MODULE_DEVICE_TABLE(of, sun6i_rtc_dt_ids);
static struct platform_driver sun6i_rtc_driver = {
.probe = sun6i_rtc_probe,
- .remove = sun6i_rtc_remove,
.driver = {
.name = "sun6i-rtc",
.of_match_table = sun6i_rtc_dt_ids,
},
};
-
-module_platform_driver(sun6i_rtc_driver);
-
-MODULE_DESCRIPTION("sun6i RTC driver");
-MODULE_AUTHOR("Chen-Yu Tsai <wens@csie.org>");
-MODULE_LICENSE("GPL");
+builtin_platform_driver(sun6i_rtc_driver);
diff --git a/drivers/rtc/rtc-tegra.c b/drivers/rtc/rtc-tegra.c
index 3853ba963bb5..d30d57b048d3 100644
--- a/drivers/rtc/rtc-tegra.c
+++ b/drivers/rtc/rtc-tegra.c
@@ -17,16 +17,18 @@
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
-#include <linux/kernel.h>
+
+#include <linux/clk.h>
+#include <linux/delay.h>
#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/irq.h>
#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/rtc.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/pm.h>
+#include <linux/rtc.h>
+#include <linux/slab.h>
/* set to 1 = busy every eight 32kHz clocks during copy of sec+msec to AHB */
#define TEGRA_RTC_REG_BUSY 0x004
@@ -59,6 +61,7 @@ struct tegra_rtc_info {
struct platform_device *pdev;
struct rtc_device *rtc_dev;
void __iomem *rtc_base; /* NULL if not initialized. */
+ struct clk *clk;
int tegra_rtc_irq; /* alarm and periodic irq */
spinlock_t tegra_rtc_lock;
};
@@ -326,6 +329,14 @@ static int __init tegra_rtc_probe(struct platform_device *pdev)
if (info->tegra_rtc_irq <= 0)
return -EBUSY;
+ info->clk = devm_clk_get(&pdev->dev, NULL);
+ if (IS_ERR(info->clk))
+ return PTR_ERR(info->clk);
+
+ ret = clk_prepare_enable(info->clk);
+ if (ret < 0)
+ return ret;
+
/* set context info. */
info->pdev = pdev;
spin_lock_init(&info->tegra_rtc_lock);
@@ -346,7 +357,7 @@ static int __init tegra_rtc_probe(struct platform_device *pdev)
ret = PTR_ERR(info->rtc_dev);
dev_err(&pdev->dev, "Unable to register device (err=%d).\n",
ret);
- return ret;
+ goto disable_clk;
}
ret = devm_request_irq(&pdev->dev, info->tegra_rtc_irq,
@@ -356,12 +367,25 @@ static int __init tegra_rtc_probe(struct platform_device *pdev)
dev_err(&pdev->dev,
"Unable to request interrupt for device (err=%d).\n",
ret);
- return ret;
+ goto disable_clk;
}
dev_notice(&pdev->dev, "Tegra internal Real Time Clock\n");
return 0;
+
+disable_clk:
+ clk_disable_unprepare(info->clk);
+ return ret;
+}
+
+static int tegra_rtc_remove(struct platform_device *pdev)
+{
+ struct tegra_rtc_info *info = platform_get_drvdata(pdev);
+
+ clk_disable_unprepare(info->clk);
+
+ return 0;
}
#ifdef CONFIG_PM_SLEEP
@@ -413,6 +437,7 @@ static void tegra_rtc_shutdown(struct platform_device *pdev)
MODULE_ALIAS("platform:tegra_rtc");
static struct platform_driver tegra_rtc_driver = {
+ .remove = tegra_rtc_remove,
.shutdown = tegra_rtc_shutdown,
.driver = {
.name = "tegra_rtc",
diff --git a/drivers/rtc/rtc-tps65910.c b/drivers/rtc/rtc-tps65910.c
index 5a3d53caa485..d0244d7979fc 100644
--- a/drivers/rtc/rtc-tps65910.c
+++ b/drivers/rtc/rtc-tps65910.c
@@ -21,6 +21,7 @@
#include <linux/types.h>
#include <linux/rtc.h>
#include <linux/bcd.h>
+#include <linux/math64.h>
#include <linux/platform_device.h>
#include <linux/interrupt.h>
#include <linux/mfd/tps65910.h>
@@ -33,7 +34,21 @@ struct tps65910_rtc {
/* Total number of RTC registers needed to set time*/
#define NUM_TIME_REGS (TPS65910_YEARS - TPS65910_SECONDS + 1)
-static int tps65910_rtc_alarm_irq_enable(struct device *dev, unsigned enabled)
+/* Total number of RTC registers needed to set compensation registers */
+#define NUM_COMP_REGS (TPS65910_RTC_COMP_MSB - TPS65910_RTC_COMP_LSB + 1)
+
+/* Min and max values supported with 'offset' interface (swapped sign) */
+#define MIN_OFFSET (-277761)
+#define MAX_OFFSET (277778)
+
+/* Number of ticks per hour */
+#define TICKS_PER_HOUR (32768 * 3600)
+
+/* Multiplier for ppb conversions */
+#define PPB_MULT (1000000000LL)
+
+static int tps65910_rtc_alarm_irq_enable(struct device *dev,
+ unsigned int enabled)
{
struct tps65910 *tps = dev_get_drvdata(dev->parent);
u8 val = 0;
@@ -187,6 +202,133 @@ static int tps65910_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
return ret;
}
+static int tps65910_rtc_set_calibration(struct device *dev, int calibration)
+{
+ unsigned char comp_data[NUM_COMP_REGS];
+ struct tps65910 *tps = dev_get_drvdata(dev->parent);
+ s16 value;
+ int ret;
+
+ /*
+ * TPS65910 uses two's complement 16 bit value for compensation for RTC
+ * crystal inaccuracies. One time every hour when seconds counter
+ * increments from 0 to 1 compensation value will be added to internal
+ * RTC counter value.
+ *
+ * Compensation value 0x7FFF is prohibited value.
+ *
+ * Valid range for compensation value: [-32768 .. 32766]
+ */
+ if ((calibration < -32768) || (calibration > 32766)) {
+ dev_err(dev, "RTC calibration value out of range: %d\n",
+ calibration);
+ return -EINVAL;
+ }
+
+ value = (s16)calibration;
+
+ comp_data[0] = (u16)value & 0xFF;
+ comp_data[1] = ((u16)value >> 8) & 0xFF;
+
+ /* Update all the compensation registers in one shot */
+ ret = regmap_bulk_write(tps->regmap, TPS65910_RTC_COMP_LSB,
+ comp_data, NUM_COMP_REGS);
+ if (ret < 0) {
+ dev_err(dev, "rtc_set_calibration error: %d\n", ret);
+ return ret;
+ }
+
+ /* Enable automatic compensation */
+ ret = regmap_update_bits(tps->regmap, TPS65910_RTC_CTRL,
+ TPS65910_RTC_CTRL_AUTO_COMP, TPS65910_RTC_CTRL_AUTO_COMP);
+ if (ret < 0)
+ dev_err(dev, "auto_comp enable failed with error: %d\n", ret);
+
+ return ret;
+}
+
+static int tps65910_rtc_get_calibration(struct device *dev, int *calibration)
+{
+ unsigned char comp_data[NUM_COMP_REGS];
+ struct tps65910 *tps = dev_get_drvdata(dev->parent);
+ unsigned int ctrl;
+ u16 value;
+ int ret;
+
+ ret = regmap_read(tps->regmap, TPS65910_RTC_CTRL, &ctrl);
+ if (ret < 0)
+ return ret;
+
+ /* If automatic compensation is not enabled report back zero */
+ if (!(ctrl & TPS65910_RTC_CTRL_AUTO_COMP)) {
+ *calibration = 0;
+ return 0;
+ }
+
+ ret = regmap_bulk_read(tps->regmap, TPS65910_RTC_COMP_LSB, comp_data,
+ NUM_COMP_REGS);
+ if (ret < 0) {
+ dev_err(dev, "rtc_get_calibration error: %d\n", ret);
+ return ret;
+ }
+
+ value = (u16)comp_data[0] | ((u16)comp_data[1] << 8);
+
+ *calibration = (s16)value;
+
+ return 0;
+}
+
+static int tps65910_read_offset(struct device *dev, long *offset)
+{
+ int calibration;
+ s64 tmp;
+ int ret;
+
+ ret = tps65910_rtc_get_calibration(dev, &calibration);
+ if (ret < 0)
+ return ret;
+
+ /* Convert from RTC calibration register format to ppb format */
+ tmp = calibration * (s64)PPB_MULT;
+ if (tmp < 0)
+ tmp -= TICKS_PER_HOUR / 2LL;
+ else
+ tmp += TICKS_PER_HOUR / 2LL;
+ tmp = div_s64(tmp, TICKS_PER_HOUR);
+
+ /* Offset value operates in negative way, so swap sign */
+ *offset = (long)-tmp;
+
+ return 0;
+}
+
+static int tps65910_set_offset(struct device *dev, long offset)
+{
+ int calibration;
+ s64 tmp;
+ int ret;
+
+ /* Make sure offset value is within supported range */
+ if (offset < MIN_OFFSET || offset > MAX_OFFSET)
+ return -ERANGE;
+
+ /* Convert from ppb format to RTC calibration register format */
+ tmp = offset * (s64)TICKS_PER_HOUR;
+ if (tmp < 0)
+ tmp -= PPB_MULT / 2LL;
+ else
+ tmp += PPB_MULT / 2LL;
+ tmp = div_s64(tmp, PPB_MULT);
+
+ /* Offset value operates in negative way, so swap sign */
+ calibration = (int)-tmp;
+
+ ret = tps65910_rtc_set_calibration(dev, calibration);
+
+ return ret;
+}
+
static irqreturn_t tps65910_rtc_interrupt(int irq, void *rtc)
{
struct device *dev = rtc;
@@ -219,6 +361,8 @@ static const struct rtc_class_ops tps65910_rtc_ops = {
.read_alarm = tps65910_rtc_read_alarm,
.set_alarm = tps65910_rtc_set_alarm,
.alarm_irq_enable = tps65910_rtc_alarm_irq_enable,
+ .read_offset = tps65910_read_offset,
+ .set_offset = tps65910_set_offset,
};
static int tps65910_rtc_probe(struct platform_device *pdev)
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 0f1713727d4c..0b38217f8147 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -4864,7 +4864,7 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device,
break;
case 3: /* tsa_intrg */
len += sprintf(page + len, PRINTK_HEADER
- " tsb->tsa.intrg.: not supportet yet\n");
+ " tsb->tsa.intrg.: not supported yet\n");
break;
}
diff --git a/drivers/s390/cio/ioasm.c b/drivers/s390/cio/ioasm.c
index 8225da619014..4182f60124da 100644
--- a/drivers/s390/cio/ioasm.c
+++ b/drivers/s390/cio/ioasm.c
@@ -165,13 +165,15 @@ int tpi(struct tpi_info *addr)
int chsc(void *chsc_area)
{
typedef struct { char _[4096]; } addr_type;
- int cc;
+ int cc = -EIO;
asm volatile(
" .insn rre,0xb25f0000,%2,0\n"
- " ipm %0\n"
+ "0: ipm %0\n"
" srl %0,28\n"
- : "=d" (cc), "=m" (*(addr_type *) chsc_area)
+ "1:\n"
+ EX_TABLE(0b, 1b)
+ : "+d" (cc), "=m" (*(addr_type *) chsc_area)
: "d" (chsc_area), "m" (*(addr_type *) chsc_area)
: "cc");
trace_s390_cio_chsc(chsc_area, cc);
diff --git a/drivers/s390/crypto/Makefile b/drivers/s390/crypto/Makefile
index 0a7fb83f35e5..be36f1010d75 100644
--- a/drivers/s390/crypto/Makefile
+++ b/drivers/s390/crypto/Makefile
@@ -10,3 +10,7 @@ zcrypt-objs += zcrypt_msgtype6.o zcrypt_msgtype50.o
obj-$(CONFIG_ZCRYPT) += zcrypt.o
# adapter drivers depend on ap.o and zcrypt.o
obj-$(CONFIG_ZCRYPT) += zcrypt_pcixcc.o zcrypt_cex2a.o zcrypt_cex4.o
+
+# pkey kernel module
+pkey-objs := pkey_api.o
+obj-$(CONFIG_PKEY) += pkey.o
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index 56db76c05775..9be4596d8a08 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -1107,16 +1107,6 @@ static void ap_config_timeout(unsigned long ptr)
queue_work(system_long_wq, &ap_scan_work);
}
-static void ap_reset_domain(void)
-{
- int i;
-
- if (ap_domain_index == -1 || !ap_test_config_domain(ap_domain_index))
- return;
- for (i = 0; i < AP_DEVICES; i++)
- ap_rapq(AP_MKQID(i, ap_domain_index));
-}
-
static void ap_reset_all(void)
{
int i, j;
diff --git a/drivers/s390/crypto/ap_card.c b/drivers/s390/crypto/ap_card.c
index 1cd9128593e4..cfa161ccc74e 100644
--- a/drivers/s390/crypto/ap_card.c
+++ b/drivers/s390/crypto/ap_card.c
@@ -58,9 +58,9 @@ static ssize_t ap_functions_show(struct device *dev,
static DEVICE_ATTR(ap_functions, 0444, ap_functions_show, NULL);
-static ssize_t ap_request_count_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
+static ssize_t ap_req_count_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
struct ap_card *ac = to_ap_card(dev);
unsigned int req_cnt;
@@ -72,7 +72,23 @@ static ssize_t ap_request_count_show(struct device *dev,
return snprintf(buf, PAGE_SIZE, "%d\n", req_cnt);
}
-static DEVICE_ATTR(request_count, 0444, ap_request_count_show, NULL);
+static ssize_t ap_req_count_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ap_card *ac = to_ap_card(dev);
+ struct ap_queue *aq;
+
+ spin_lock_bh(&ap_list_lock);
+ for_each_ap_queue(aq, ac)
+ aq->total_request_count = 0;
+ spin_unlock_bh(&ap_list_lock);
+ atomic_set(&ac->total_request_count, 0);
+
+ return count;
+}
+
+static DEVICE_ATTR(request_count, 0644, ap_req_count_show, ap_req_count_store);
static ssize_t ap_requestq_count_show(struct device *dev,
struct device_attribute *attr, char *buf)
diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c
index 7be67fa9f224..480c58a63769 100644
--- a/drivers/s390/crypto/ap_queue.c
+++ b/drivers/s390/crypto/ap_queue.c
@@ -459,9 +459,9 @@ EXPORT_SYMBOL(ap_queue_resume);
/*
* AP queue related attributes.
*/
-static ssize_t ap_request_count_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
+static ssize_t ap_req_count_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
struct ap_queue *aq = to_ap_queue(dev);
unsigned int req_cnt;
@@ -472,7 +472,20 @@ static ssize_t ap_request_count_show(struct device *dev,
return snprintf(buf, PAGE_SIZE, "%d\n", req_cnt);
}
-static DEVICE_ATTR(request_count, 0444, ap_request_count_show, NULL);
+static ssize_t ap_req_count_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ap_queue *aq = to_ap_queue(dev);
+
+ spin_lock_bh(&aq->lock);
+ aq->total_request_count = 0;
+ spin_unlock_bh(&aq->lock);
+
+ return count;
+}
+
+static DEVICE_ATTR(request_count, 0644, ap_req_count_show, ap_req_count_store);
static ssize_t ap_requestq_count_show(struct device *dev,
struct device_attribute *attr, char *buf)
diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c
new file mode 100644
index 000000000000..40f1136f5568
--- /dev/null
+++ b/drivers/s390/crypto/pkey_api.c
@@ -0,0 +1,1148 @@
+/*
+ * pkey device driver
+ *
+ * Copyright IBM Corp. 2017
+ * Author(s): Harald Freudenberger
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ */
+
+#define KMSG_COMPONENT "pkey"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <asm/zcrypt.h>
+#include <asm/cpacf.h>
+#include <asm/pkey.h>
+
+#include "zcrypt_api.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("IBM Corporation");
+MODULE_DESCRIPTION("s390 protected key interface");
+
+/* Size of parameter block used for all cca requests/replies */
+#define PARMBSIZE 512
+
+/* Size of vardata block used for some of the cca requests/replies */
+#define VARDATASIZE 4096
+
+/*
+ * debug feature data and functions
+ */
+
+static debug_info_t *debug_info;
+
+#define DEBUG_DBG(...) debug_sprintf_event(debug_info, 6, ##__VA_ARGS__)
+#define DEBUG_INFO(...) debug_sprintf_event(debug_info, 5, ##__VA_ARGS__)
+#define DEBUG_WARN(...) debug_sprintf_event(debug_info, 4, ##__VA_ARGS__)
+#define DEBUG_ERR(...) debug_sprintf_event(debug_info, 3, ##__VA_ARGS__)
+
+static void __init pkey_debug_init(void)
+{
+ debug_info = debug_register("pkey", 1, 1, 4 * sizeof(long));
+ debug_register_view(debug_info, &debug_sprintf_view);
+ debug_set_level(debug_info, 3);
+}
+
+static void __exit pkey_debug_exit(void)
+{
+ debug_unregister(debug_info);
+}
+
+/* inside view of a secure key token (only type 0x01 version 0x04) */
+struct secaeskeytoken {
+ u8 type; /* 0x01 for internal key token */
+ u8 res0[3];
+ u8 version; /* should be 0x04 */
+ u8 res1[1];
+ u8 flag; /* key flags */
+ u8 res2[1];
+ u64 mkvp; /* master key verification pattern */
+ u8 key[32]; /* key value (encrypted) */
+ u8 cv[8]; /* control vector */
+ u16 bitsize; /* key bit size */
+ u16 keysize; /* key byte size */
+ u8 tvv[4]; /* token validation value */
+} __packed;
+
+/*
+ * Simple check if the token is a valid CCA secure AES key
+ * token. If keybitsize is given, the bitsize of the key is
+ * also checked. Returns 0 on success or errno value on failure.
+ */
+static int check_secaeskeytoken(u8 *token, int keybitsize)
+{
+ struct secaeskeytoken *t = (struct secaeskeytoken *) token;
+
+ if (t->type != 0x01) {
+ DEBUG_ERR(
+ "check_secaeskeytoken secure token check failed, type mismatch 0x%02x != 0x01\n",
+ (int) t->type);
+ return -EINVAL;
+ }
+ if (t->version != 0x04) {
+ DEBUG_ERR(
+ "check_secaeskeytoken secure token check failed, version mismatch 0x%02x != 0x04\n",
+ (int) t->version);
+ return -EINVAL;
+ }
+ if (keybitsize > 0 && t->bitsize != keybitsize) {
+ DEBUG_ERR(
+ "check_secaeskeytoken secure token check failed, bitsize mismatch %d != %d\n",
+ (int) t->bitsize, keybitsize);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Allocate consecutive memory for request CPRB, request param
+ * block, reply CPRB and reply param block and fill in values
+ * for the common fields. Returns 0 on success or errno value
+ * on failure.
+ */
+static int alloc_and_prep_cprbmem(size_t paramblen,
+ u8 **pcprbmem,
+ struct CPRBX **preqCPRB,
+ struct CPRBX **prepCPRB)
+{
+ u8 *cprbmem;
+ size_t cprbplusparamblen = sizeof(struct CPRBX) + paramblen;
+ struct CPRBX *preqcblk, *prepcblk;
+
+ /*
+ * allocate consecutive memory for request CPRB, request param
+ * block, reply CPRB and reply param block
+ */
+ cprbmem = kmalloc(2 * cprbplusparamblen, GFP_KERNEL);
+ if (!cprbmem)
+ return -ENOMEM;
+ memset(cprbmem, 0, 2 * cprbplusparamblen);
+
+ preqcblk = (struct CPRBX *) cprbmem;
+ prepcblk = (struct CPRBX *) (cprbmem + cprbplusparamblen);
+
+ /* fill request cprb struct */
+ preqcblk->cprb_len = sizeof(struct CPRBX);
+ preqcblk->cprb_ver_id = 0x02;
+ memcpy(preqcblk->func_id, "T2", 2);
+ preqcblk->rpl_msgbl = cprbplusparamblen;
+ if (paramblen) {
+ preqcblk->req_parmb =
+ ((u8 *) preqcblk) + sizeof(struct CPRBX);
+ preqcblk->rpl_parmb =
+ ((u8 *) prepcblk) + sizeof(struct CPRBX);
+ }
+
+ *pcprbmem = cprbmem;
+ *preqCPRB = preqcblk;
+ *prepCPRB = prepcblk;
+
+ return 0;
+}
+
+/*
+ * Free the cprb memory allocated with the function above.
+ * If the scrub value is not zero, the memory is filled
+ * with zeros before freeing (useful if there was some
+ * clear key material in there).
+ */
+static void free_cprbmem(void *mem, size_t paramblen, int scrub)
+{
+ if (scrub)
+ memzero_explicit(mem, 2 * (sizeof(struct CPRBX) + paramblen));
+ kfree(mem);
+}
+
+/*
+ * Helper function to prepare the xcrb struct
+ */
+static inline void prep_xcrb(struct ica_xcRB *pxcrb,
+ u16 cardnr,
+ struct CPRBX *preqcblk,
+ struct CPRBX *prepcblk)
+{
+ memset(pxcrb, 0, sizeof(*pxcrb));
+ pxcrb->agent_ID = 0x4341; /* 'CA' */
+ pxcrb->user_defined = (cardnr == 0xFFFF ? AUTOSELECT : cardnr);
+ pxcrb->request_control_blk_length =
+ preqcblk->cprb_len + preqcblk->req_parml;
+ pxcrb->request_control_blk_addr = (void *) preqcblk;
+ pxcrb->reply_control_blk_length = preqcblk->rpl_msgbl;
+ pxcrb->reply_control_blk_addr = (void *) prepcblk;
+}
+
+/*
+ * Helper function which calls zcrypt_send_cprb with
+ * memory management segment adjusted to kernel space
+ * so that the copy_from_user called within this
+ * function do in fact copy from kernel space.
+ */
+static inline int _zcrypt_send_cprb(struct ica_xcRB *xcrb)
+{
+ int rc;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ rc = zcrypt_send_cprb(xcrb);
+ set_fs(old_fs);
+
+ return rc;
+}
+
+/*
+ * Generate (random) AES secure key.
+ */
+int pkey_genseckey(u16 cardnr, u16 domain,
+ u32 keytype, struct pkey_seckey *seckey)
+{
+ int i, rc, keysize;
+ int seckeysize;
+ u8 *mem;
+ struct CPRBX *preqcblk, *prepcblk;
+ struct ica_xcRB xcrb;
+ struct kgreqparm {
+ u8 subfunc_code[2];
+ u16 rule_array_len;
+ struct lv1 {
+ u16 len;
+ char key_form[8];
+ char key_length[8];
+ char key_type1[8];
+ char key_type2[8];
+ } lv1;
+ struct lv2 {
+ u16 len;
+ struct keyid {
+ u16 len;
+ u16 attr;
+ u8 data[SECKEYBLOBSIZE];
+ } keyid[6];
+ } lv2;
+ } *preqparm;
+ struct kgrepparm {
+ u8 subfunc_code[2];
+ u16 rule_array_len;
+ struct lv3 {
+ u16 len;
+ u16 keyblocklen;
+ struct {
+ u16 toklen;
+ u16 tokattr;
+ u8 tok[0];
+ /* ... some more data ... */
+ } keyblock;
+ } lv3;
+ } *prepparm;
+
+ /* get already prepared memory for 2 cprbs with param block each */
+ rc = alloc_and_prep_cprbmem(PARMBSIZE, &mem, &preqcblk, &prepcblk);
+ if (rc)
+ return rc;
+
+ /* fill request cprb struct */
+ preqcblk->domain = domain;
+
+ /* fill request cprb param block with KG request */
+ preqparm = (struct kgreqparm *) preqcblk->req_parmb;
+ memcpy(preqparm->subfunc_code, "KG", 2);
+ preqparm->rule_array_len = sizeof(preqparm->rule_array_len);
+ preqparm->lv1.len = sizeof(struct lv1);
+ memcpy(preqparm->lv1.key_form, "OP ", 8);
+ switch (keytype) {
+ case PKEY_KEYTYPE_AES_128:
+ keysize = 16;
+ memcpy(preqparm->lv1.key_length, "KEYLN16 ", 8);
+ break;
+ case PKEY_KEYTYPE_AES_192:
+ keysize = 24;
+ memcpy(preqparm->lv1.key_length, "KEYLN24 ", 8);
+ break;
+ case PKEY_KEYTYPE_AES_256:
+ keysize = 32;
+ memcpy(preqparm->lv1.key_length, "KEYLN32 ", 8);
+ break;
+ default:
+ DEBUG_ERR(
+ "pkey_genseckey unknown/unsupported keytype %d\n",
+ keytype);
+ rc = -EINVAL;
+ goto out;
+ }
+ memcpy(preqparm->lv1.key_type1, "AESDATA ", 8);
+ preqparm->lv2.len = sizeof(struct lv2);
+ for (i = 0; i < 6; i++) {
+ preqparm->lv2.keyid[i].len = sizeof(struct keyid);
+ preqparm->lv2.keyid[i].attr = (i == 2 ? 0x30 : 0x10);
+ }
+ preqcblk->req_parml = sizeof(struct kgreqparm);
+
+ /* fill xcrb struct */
+ prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+ /* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+ rc = _zcrypt_send_cprb(&xcrb);
+ if (rc) {
+ DEBUG_ERR(
+ "pkey_genseckey zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+ (int) cardnr, (int) domain, rc);
+ goto out;
+ }
+
+ /* check response returncode and reasoncode */
+ if (prepcblk->ccp_rtcode != 0) {
+ DEBUG_ERR(
+ "pkey_genseckey secure key generate failure, card response %d/%d\n",
+ (int) prepcblk->ccp_rtcode,
+ (int) prepcblk->ccp_rscode);
+ rc = -EIO;
+ goto out;
+ }
+
+ /* process response cprb param block */
+ prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+ prepparm = (struct kgrepparm *) prepcblk->rpl_parmb;
+
+ /* check length of the returned secure key token */
+ seckeysize = prepparm->lv3.keyblock.toklen
+ - sizeof(prepparm->lv3.keyblock.toklen)
+ - sizeof(prepparm->lv3.keyblock.tokattr);
+ if (seckeysize != SECKEYBLOBSIZE) {
+ DEBUG_ERR(
+ "pkey_genseckey secure token size mismatch %d != %d bytes\n",
+ seckeysize, SECKEYBLOBSIZE);
+ rc = -EIO;
+ goto out;
+ }
+
+ /* check secure key token */
+ rc = check_secaeskeytoken(prepparm->lv3.keyblock.tok, 8*keysize);
+ if (rc) {
+ rc = -EIO;
+ goto out;
+ }
+
+ /* copy the generated secure key token */
+ memcpy(seckey->seckey, prepparm->lv3.keyblock.tok, SECKEYBLOBSIZE);
+
+out:
+ free_cprbmem(mem, PARMBSIZE, 0);
+ return rc;
+}
+EXPORT_SYMBOL(pkey_genseckey);
+
+/*
+ * Generate an AES secure key with given key value.
+ */
+int pkey_clr2seckey(u16 cardnr, u16 domain, u32 keytype,
+ const struct pkey_clrkey *clrkey,
+ struct pkey_seckey *seckey)
+{
+ int rc, keysize, seckeysize;
+ u8 *mem;
+ struct CPRBX *preqcblk, *prepcblk;
+ struct ica_xcRB xcrb;
+ struct cmreqparm {
+ u8 subfunc_code[2];
+ u16 rule_array_len;
+ char rule_array[8];
+ struct lv1 {
+ u16 len;
+ u8 clrkey[0];
+ } lv1;
+ struct lv2 {
+ u16 len;
+ struct keyid {
+ u16 len;
+ u16 attr;
+ u8 data[SECKEYBLOBSIZE];
+ } keyid;
+ } lv2;
+ } *preqparm;
+ struct lv2 *plv2;
+ struct cmrepparm {
+ u8 subfunc_code[2];
+ u16 rule_array_len;
+ struct lv3 {
+ u16 len;
+ u16 keyblocklen;
+ struct {
+ u16 toklen;
+ u16 tokattr;
+ u8 tok[0];
+ /* ... some more data ... */
+ } keyblock;
+ } lv3;
+ } *prepparm;
+
+ /* get already prepared memory for 2 cprbs with param block each */
+ rc = alloc_and_prep_cprbmem(PARMBSIZE, &mem, &preqcblk, &prepcblk);
+ if (rc)
+ return rc;
+
+ /* fill request cprb struct */
+ preqcblk->domain = domain;
+
+ /* fill request cprb param block with CM request */
+ preqparm = (struct cmreqparm *) preqcblk->req_parmb;
+ memcpy(preqparm->subfunc_code, "CM", 2);
+ memcpy(preqparm->rule_array, "AES ", 8);
+ preqparm->rule_array_len =
+ sizeof(preqparm->rule_array_len) + sizeof(preqparm->rule_array);
+ switch (keytype) {
+ case PKEY_KEYTYPE_AES_128:
+ keysize = 16;
+ break;
+ case PKEY_KEYTYPE_AES_192:
+ keysize = 24;
+ break;
+ case PKEY_KEYTYPE_AES_256:
+ keysize = 32;
+ break;
+ default:
+ DEBUG_ERR(
+ "pkey_clr2seckey unknown/unsupported keytype %d\n",
+ keytype);
+ rc = -EINVAL;
+ goto out;
+ }
+ preqparm->lv1.len = sizeof(struct lv1) + keysize;
+ memcpy(preqparm->lv1.clrkey, clrkey->clrkey, keysize);
+ plv2 = (struct lv2 *) (((u8 *) &preqparm->lv2) + keysize);
+ plv2->len = sizeof(struct lv2);
+ plv2->keyid.len = sizeof(struct keyid);
+ plv2->keyid.attr = 0x30;
+ preqcblk->req_parml = sizeof(struct cmreqparm) + keysize;
+
+ /* fill xcrb struct */
+ prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+ /* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+ rc = _zcrypt_send_cprb(&xcrb);
+ if (rc) {
+ DEBUG_ERR(
+ "pkey_clr2seckey zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+ (int) cardnr, (int) domain, rc);
+ goto out;
+ }
+
+ /* check response returncode and reasoncode */
+ if (prepcblk->ccp_rtcode != 0) {
+ DEBUG_ERR(
+ "pkey_clr2seckey clear key import failure, card response %d/%d\n",
+ (int) prepcblk->ccp_rtcode,
+ (int) prepcblk->ccp_rscode);
+ rc = -EIO;
+ goto out;
+ }
+
+ /* process response cprb param block */
+ prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+ prepparm = (struct cmrepparm *) prepcblk->rpl_parmb;
+
+ /* check length of the returned secure key token */
+ seckeysize = prepparm->lv3.keyblock.toklen
+ - sizeof(prepparm->lv3.keyblock.toklen)
+ - sizeof(prepparm->lv3.keyblock.tokattr);
+ if (seckeysize != SECKEYBLOBSIZE) {
+ DEBUG_ERR(
+ "pkey_clr2seckey secure token size mismatch %d != %d bytes\n",
+ seckeysize, SECKEYBLOBSIZE);
+ rc = -EIO;
+ goto out;
+ }
+
+ /* check secure key token */
+ rc = check_secaeskeytoken(prepparm->lv3.keyblock.tok, 8*keysize);
+ if (rc) {
+ rc = -EIO;
+ goto out;
+ }
+
+ /* copy the generated secure key token */
+ memcpy(seckey->seckey, prepparm->lv3.keyblock.tok, SECKEYBLOBSIZE);
+
+out:
+ free_cprbmem(mem, PARMBSIZE, 1);
+ return rc;
+}
+EXPORT_SYMBOL(pkey_clr2seckey);
+
+/*
+ * Derive a proteced key from the secure key blob.
+ */
+int pkey_sec2protkey(u16 cardnr, u16 domain,
+ const struct pkey_seckey *seckey,
+ struct pkey_protkey *protkey)
+{
+ int rc;
+ u8 *mem;
+ struct CPRBX *preqcblk, *prepcblk;
+ struct ica_xcRB xcrb;
+ struct uskreqparm {
+ u8 subfunc_code[2];
+ u16 rule_array_len;
+ struct lv1 {
+ u16 len;
+ u16 attr_len;
+ u16 attr_flags;
+ } lv1;
+ struct lv2 {
+ u16 len;
+ u16 attr_len;
+ u16 attr_flags;
+ u8 token[0]; /* cca secure key token */
+ } lv2 __packed;
+ } *preqparm;
+ struct uskrepparm {
+ u8 subfunc_code[2];
+ u16 rule_array_len;
+ struct lv3 {
+ u16 len;
+ u16 attr_len;
+ u16 attr_flags;
+ struct cpacfkeyblock {
+ u8 version; /* version of this struct */
+ u8 flags[2];
+ u8 algo;
+ u8 form;
+ u8 pad1[3];
+ u16 keylen;
+ u8 key[64]; /* the key (keylen bytes) */
+ u16 keyattrlen;
+ u8 keyattr[32];
+ u8 pad2[1];
+ u8 vptype;
+ u8 vp[32]; /* verification pattern */
+ } keyblock;
+ } lv3 __packed;
+ } *prepparm;
+
+ /* get already prepared memory for 2 cprbs with param block each */
+ rc = alloc_and_prep_cprbmem(PARMBSIZE, &mem, &preqcblk, &prepcblk);
+ if (rc)
+ return rc;
+
+ /* fill request cprb struct */
+ preqcblk->domain = domain;
+
+ /* fill request cprb param block with USK request */
+ preqparm = (struct uskreqparm *) preqcblk->req_parmb;
+ memcpy(preqparm->subfunc_code, "US", 2);
+ preqparm->rule_array_len = sizeof(preqparm->rule_array_len);
+ preqparm->lv1.len = sizeof(struct lv1);
+ preqparm->lv1.attr_len = sizeof(struct lv1) - sizeof(preqparm->lv1.len);
+ preqparm->lv1.attr_flags = 0x0001;
+ preqparm->lv2.len = sizeof(struct lv2) + SECKEYBLOBSIZE;
+ preqparm->lv2.attr_len = sizeof(struct lv2)
+ - sizeof(preqparm->lv2.len) + SECKEYBLOBSIZE;
+ preqparm->lv2.attr_flags = 0x0000;
+ memcpy(preqparm->lv2.token, seckey->seckey, SECKEYBLOBSIZE);
+ preqcblk->req_parml = sizeof(struct uskreqparm) + SECKEYBLOBSIZE;
+
+ /* fill xcrb struct */
+ prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+ /* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+ rc = _zcrypt_send_cprb(&xcrb);
+ if (rc) {
+ DEBUG_ERR(
+ "pkey_sec2protkey zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+ (int) cardnr, (int) domain, rc);
+ goto out;
+ }
+
+ /* check response returncode and reasoncode */
+ if (prepcblk->ccp_rtcode != 0) {
+ DEBUG_ERR(
+ "pkey_sec2protkey unwrap secure key failure, card response %d/%d\n",
+ (int) prepcblk->ccp_rtcode,
+ (int) prepcblk->ccp_rscode);
+ rc = -EIO;
+ goto out;
+ }
+
+ /* process response cprb param block */
+ prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+ prepparm = (struct uskrepparm *) prepcblk->rpl_parmb;
+
+ /* check the returned keyblock */
+ if (prepparm->lv3.keyblock.version != 0x01) {
+ DEBUG_ERR(
+ "pkey_sec2protkey reply param keyblock version mismatch 0x%02x != 0x01\n",
+ (int) prepparm->lv3.keyblock.version);
+ rc = -EIO;
+ goto out;
+ }
+
+ /* copy the tanslated protected key */
+ switch (prepparm->lv3.keyblock.keylen) {
+ case 16+32:
+ protkey->type = PKEY_KEYTYPE_AES_128;
+ break;
+ case 24+32:
+ protkey->type = PKEY_KEYTYPE_AES_192;
+ break;
+ case 32+32:
+ protkey->type = PKEY_KEYTYPE_AES_256;
+ break;
+ default:
+ DEBUG_ERR("pkey_sec2protkey unknown/unsupported keytype %d\n",
+ prepparm->lv3.keyblock.keylen);
+ rc = -EIO;
+ goto out;
+ }
+ protkey->len = prepparm->lv3.keyblock.keylen;
+ memcpy(protkey->protkey, prepparm->lv3.keyblock.key, protkey->len);
+
+out:
+ free_cprbmem(mem, PARMBSIZE, 0);
+ return rc;
+}
+EXPORT_SYMBOL(pkey_sec2protkey);
+
+/*
+ * Create a protected key from a clear key value.
+ */
+int pkey_clr2protkey(u32 keytype,
+ const struct pkey_clrkey *clrkey,
+ struct pkey_protkey *protkey)
+{
+ long fc;
+ int keysize;
+ u8 paramblock[64];
+
+ switch (keytype) {
+ case PKEY_KEYTYPE_AES_128:
+ keysize = 16;
+ fc = CPACF_PCKMO_ENC_AES_128_KEY;
+ break;
+ case PKEY_KEYTYPE_AES_192:
+ keysize = 24;
+ fc = CPACF_PCKMO_ENC_AES_192_KEY;
+ break;
+ case PKEY_KEYTYPE_AES_256:
+ keysize = 32;
+ fc = CPACF_PCKMO_ENC_AES_256_KEY;
+ break;
+ default:
+ DEBUG_ERR("pkey_clr2protkey unknown/unsupported keytype %d\n",
+ keytype);
+ return -EINVAL;
+ }
+
+ /* prepare param block */
+ memset(paramblock, 0, sizeof(paramblock));
+ memcpy(paramblock, clrkey->clrkey, keysize);
+
+ /* call the pckmo instruction */
+ cpacf_pckmo(fc, paramblock);
+
+ /* copy created protected key */
+ protkey->type = keytype;
+ protkey->len = keysize + 32;
+ memcpy(protkey->protkey, paramblock, keysize + 32);
+
+ return 0;
+}
+EXPORT_SYMBOL(pkey_clr2protkey);
+
+/*
+ * query cryptographic facility from adapter
+ */
+static int query_crypto_facility(u16 cardnr, u16 domain,
+ const char *keyword,
+ u8 *rarray, size_t *rarraylen,
+ u8 *varray, size_t *varraylen)
+{
+ int rc;
+ u16 len;
+ u8 *mem, *ptr;
+ struct CPRBX *preqcblk, *prepcblk;
+ struct ica_xcRB xcrb;
+ struct fqreqparm {
+ u8 subfunc_code[2];
+ u16 rule_array_len;
+ char rule_array[8];
+ struct lv1 {
+ u16 len;
+ u8 data[VARDATASIZE];
+ } lv1;
+ u16 dummylen;
+ } *preqparm;
+ size_t parmbsize = sizeof(struct fqreqparm);
+ struct fqrepparm {
+ u8 subfunc_code[2];
+ u8 lvdata[0];
+ } *prepparm;
+
+ /* get already prepared memory for 2 cprbs with param block each */
+ rc = alloc_and_prep_cprbmem(parmbsize, &mem, &preqcblk, &prepcblk);
+ if (rc)
+ return rc;
+
+ /* fill request cprb struct */
+ preqcblk->domain = domain;
+
+ /* fill request cprb param block with FQ request */
+ preqparm = (struct fqreqparm *) preqcblk->req_parmb;
+ memcpy(preqparm->subfunc_code, "FQ", 2);
+ strncpy(preqparm->rule_array, keyword, sizeof(preqparm->rule_array));
+ preqparm->rule_array_len =
+ sizeof(preqparm->rule_array_len) + sizeof(preqparm->rule_array);
+ preqparm->lv1.len = sizeof(preqparm->lv1);
+ preqparm->dummylen = sizeof(preqparm->dummylen);
+ preqcblk->req_parml = parmbsize;
+
+ /* fill xcrb struct */
+ prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+ /* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+ rc = _zcrypt_send_cprb(&xcrb);
+ if (rc) {
+ DEBUG_ERR(
+ "query_crypto_facility zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+ (int) cardnr, (int) domain, rc);
+ goto out;
+ }
+
+ /* check response returncode and reasoncode */
+ if (prepcblk->ccp_rtcode != 0) {
+ DEBUG_ERR(
+ "query_crypto_facility unwrap secure key failure, card response %d/%d\n",
+ (int) prepcblk->ccp_rtcode,
+ (int) prepcblk->ccp_rscode);
+ rc = -EIO;
+ goto out;
+ }
+
+ /* process response cprb param block */
+ prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+ prepparm = (struct fqrepparm *) prepcblk->rpl_parmb;
+ ptr = prepparm->lvdata;
+
+ /* check and possibly copy reply rule array */
+ len = *((u16 *) ptr);
+ if (len > sizeof(u16)) {
+ ptr += sizeof(u16);
+ len -= sizeof(u16);
+ if (rarray && rarraylen && *rarraylen > 0) {
+ *rarraylen = (len > *rarraylen ? *rarraylen : len);
+ memcpy(rarray, ptr, *rarraylen);
+ }
+ ptr += len;
+ }
+ /* check and possible copy reply var array */
+ len = *((u16 *) ptr);
+ if (len > sizeof(u16)) {
+ ptr += sizeof(u16);
+ len -= sizeof(u16);
+ if (varray && varraylen && *varraylen > 0) {
+ *varraylen = (len > *varraylen ? *varraylen : len);
+ memcpy(varray, ptr, *varraylen);
+ }
+ ptr += len;
+ }
+
+out:
+ free_cprbmem(mem, parmbsize, 0);
+ return rc;
+}
+
+/*
+ * Fetch just the mkvp value via query_crypto_facility from adapter.
+ */
+static int fetch_mkvp(u16 cardnr, u16 domain, u64 *mkvp)
+{
+ int rc, found = 0;
+ size_t rlen, vlen;
+ u8 *rarray, *varray, *pg;
+
+ pg = (u8 *) __get_free_page(GFP_KERNEL);
+ if (!pg)
+ return -ENOMEM;
+ rarray = pg;
+ varray = pg + PAGE_SIZE/2;
+ rlen = vlen = PAGE_SIZE/2;
+
+ rc = query_crypto_facility(cardnr, domain, "STATICSA",
+ rarray, &rlen, varray, &vlen);
+ if (rc == 0 && rlen > 8*8 && vlen > 184+8) {
+ if (rarray[64] == '2') {
+ /* current master key state is valid */
+ *mkvp = *((u64 *)(varray + 184));
+ found = 1;
+ }
+ }
+
+ free_page((unsigned long) pg);
+
+ return found ? 0 : -ENOENT;
+}
+
+/* struct to hold cached mkvp info for each card/domain */
+struct mkvp_info {
+ struct list_head list;
+ u16 cardnr;
+ u16 domain;
+ u64 mkvp;
+};
+
+/* a list with mkvp_info entries */
+static LIST_HEAD(mkvp_list);
+static DEFINE_SPINLOCK(mkvp_list_lock);
+
+static int mkvp_cache_fetch(u16 cardnr, u16 domain, u64 *mkvp)
+{
+ int rc = -ENOENT;
+ struct mkvp_info *ptr;
+
+ spin_lock_bh(&mkvp_list_lock);
+ list_for_each_entry(ptr, &mkvp_list, list) {
+ if (ptr->cardnr == cardnr &&
+ ptr->domain == domain) {
+ *mkvp = ptr->mkvp;
+ rc = 0;
+ break;
+ }
+ }
+ spin_unlock_bh(&mkvp_list_lock);
+
+ return rc;
+}
+
+static void mkvp_cache_update(u16 cardnr, u16 domain, u64 mkvp)
+{
+ int found = 0;
+ struct mkvp_info *ptr;
+
+ spin_lock_bh(&mkvp_list_lock);
+ list_for_each_entry(ptr, &mkvp_list, list) {
+ if (ptr->cardnr == cardnr &&
+ ptr->domain == domain) {
+ ptr->mkvp = mkvp;
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ ptr = kmalloc(sizeof(*ptr), GFP_ATOMIC);
+ if (!ptr) {
+ spin_unlock_bh(&mkvp_list_lock);
+ return;
+ }
+ ptr->cardnr = cardnr;
+ ptr->domain = domain;
+ ptr->mkvp = mkvp;
+ list_add(&ptr->list, &mkvp_list);
+ }
+ spin_unlock_bh(&mkvp_list_lock);
+}
+
+static void mkvp_cache_scrub(u16 cardnr, u16 domain)
+{
+ struct mkvp_info *ptr;
+
+ spin_lock_bh(&mkvp_list_lock);
+ list_for_each_entry(ptr, &mkvp_list, list) {
+ if (ptr->cardnr == cardnr &&
+ ptr->domain == domain) {
+ list_del(&ptr->list);
+ kfree(ptr);
+ break;
+ }
+ }
+ spin_unlock_bh(&mkvp_list_lock);
+}
+
+static void __exit mkvp_cache_free(void)
+{
+ struct mkvp_info *ptr, *pnext;
+
+ spin_lock_bh(&mkvp_list_lock);
+ list_for_each_entry_safe(ptr, pnext, &mkvp_list, list) {
+ list_del(&ptr->list);
+ kfree(ptr);
+ }
+ spin_unlock_bh(&mkvp_list_lock);
+}
+
+/*
+ * Search for a matching crypto card based on the Master Key
+ * Verification Pattern provided inside a secure key.
+ */
+int pkey_findcard(const struct pkey_seckey *seckey,
+ u16 *pcardnr, u16 *pdomain, int verify)
+{
+ struct secaeskeytoken *t = (struct secaeskeytoken *) seckey;
+ struct zcrypt_device_matrix *device_matrix;
+ u16 card, dom;
+ u64 mkvp;
+ int i, rc;
+
+ /* mkvp must not be zero */
+ if (t->mkvp == 0)
+ return -EINVAL;
+
+ /* fetch status of all crypto cards */
+ device_matrix = kmalloc(sizeof(struct zcrypt_device_matrix),
+ GFP_KERNEL);
+ if (!device_matrix)
+ return -ENOMEM;
+ zcrypt_device_status_mask(device_matrix);
+
+ /* walk through all crypto cards */
+ for (i = 0; i < MAX_ZDEV_ENTRIES; i++) {
+ card = AP_QID_CARD(device_matrix->device[i].qid);
+ dom = AP_QID_QUEUE(device_matrix->device[i].qid);
+ if (device_matrix->device[i].online &&
+ device_matrix->device[i].functions & 0x04) {
+ /* an enabled CCA Coprocessor card */
+ /* try cached mkvp */
+ if (mkvp_cache_fetch(card, dom, &mkvp) == 0 &&
+ t->mkvp == mkvp) {
+ if (!verify)
+ break;
+ /* verify: fetch mkvp from adapter */
+ if (fetch_mkvp(card, dom, &mkvp) == 0) {
+ mkvp_cache_update(card, dom, mkvp);
+ if (t->mkvp == mkvp)
+ break;
+ }
+ }
+ } else {
+ /* Card is offline and/or not a CCA card. */
+ /* del mkvp entry from cache if it exists */
+ mkvp_cache_scrub(card, dom);
+ }
+ }
+ if (i >= MAX_ZDEV_ENTRIES) {
+ /* nothing found, so this time without cache */
+ for (i = 0; i < MAX_ZDEV_ENTRIES; i++) {
+ if (!(device_matrix->device[i].online &&
+ device_matrix->device[i].functions & 0x04))
+ continue;
+ card = AP_QID_CARD(device_matrix->device[i].qid);
+ dom = AP_QID_QUEUE(device_matrix->device[i].qid);
+ /* fresh fetch mkvp from adapter */
+ if (fetch_mkvp(card, dom, &mkvp) == 0) {
+ mkvp_cache_update(card, dom, mkvp);
+ if (t->mkvp == mkvp)
+ break;
+ }
+ }
+ }
+ if (i < MAX_ZDEV_ENTRIES) {
+ if (pcardnr)
+ *pcardnr = card;
+ if (pdomain)
+ *pdomain = dom;
+ rc = 0;
+ } else
+ rc = -ENODEV;
+
+ kfree(device_matrix);
+ return rc;
+}
+EXPORT_SYMBOL(pkey_findcard);
+
+/*
+ * Find card and transform secure key into protected key.
+ */
+int pkey_skey2pkey(const struct pkey_seckey *seckey,
+ struct pkey_protkey *protkey)
+{
+ u16 cardnr, domain;
+ int rc, verify;
+
+ /*
+ * The pkey_sec2protkey call may fail when a card has been
+ * addressed where the master key was changed after last fetch
+ * of the mkvp into the cache. So first try without verify then
+ * with verify enabled (thus refreshing the mkvp for each card).
+ */
+ for (verify = 0; verify < 2; verify++) {
+ rc = pkey_findcard(seckey, &cardnr, &domain, verify);
+ if (rc)
+ continue;
+ rc = pkey_sec2protkey(cardnr, domain, seckey, protkey);
+ if (rc == 0)
+ break;
+ }
+
+ if (rc)
+ DEBUG_DBG("pkey_skey2pkey failed rc=%d\n", rc);
+
+ return rc;
+}
+EXPORT_SYMBOL(pkey_skey2pkey);
+
+/*
+ * File io functions
+ */
+
+static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ int rc;
+
+ switch (cmd) {
+ case PKEY_GENSECK: {
+ struct pkey_genseck __user *ugs = (void __user *) arg;
+ struct pkey_genseck kgs;
+
+ if (copy_from_user(&kgs, ugs, sizeof(kgs)))
+ return -EFAULT;
+ rc = pkey_genseckey(kgs.cardnr, kgs.domain,
+ kgs.keytype, &kgs.seckey);
+ DEBUG_DBG("pkey_ioctl pkey_genseckey()=%d\n", rc);
+ if (rc)
+ break;
+ if (copy_to_user(ugs, &kgs, sizeof(kgs)))
+ return -EFAULT;
+ break;
+ }
+ case PKEY_CLR2SECK: {
+ struct pkey_clr2seck __user *ucs = (void __user *) arg;
+ struct pkey_clr2seck kcs;
+
+ if (copy_from_user(&kcs, ucs, sizeof(kcs)))
+ return -EFAULT;
+ rc = pkey_clr2seckey(kcs.cardnr, kcs.domain, kcs.keytype,
+ &kcs.clrkey, &kcs.seckey);
+ DEBUG_DBG("pkey_ioctl pkey_clr2seckey()=%d\n", rc);
+ if (rc)
+ break;
+ if (copy_to_user(ucs, &kcs, sizeof(kcs)))
+ return -EFAULT;
+ memzero_explicit(&kcs, sizeof(kcs));
+ break;
+ }
+ case PKEY_SEC2PROTK: {
+ struct pkey_sec2protk __user *usp = (void __user *) arg;
+ struct pkey_sec2protk ksp;
+
+ if (copy_from_user(&ksp, usp, sizeof(ksp)))
+ return -EFAULT;
+ rc = pkey_sec2protkey(ksp.cardnr, ksp.domain,
+ &ksp.seckey, &ksp.protkey);
+ DEBUG_DBG("pkey_ioctl pkey_sec2protkey()=%d\n", rc);
+ if (rc)
+ break;
+ if (copy_to_user(usp, &ksp, sizeof(ksp)))
+ return -EFAULT;
+ break;
+ }
+ case PKEY_CLR2PROTK: {
+ struct pkey_clr2protk __user *ucp = (void __user *) arg;
+ struct pkey_clr2protk kcp;
+
+ if (copy_from_user(&kcp, ucp, sizeof(kcp)))
+ return -EFAULT;
+ rc = pkey_clr2protkey(kcp.keytype,
+ &kcp.clrkey, &kcp.protkey);
+ DEBUG_DBG("pkey_ioctl pkey_clr2protkey()=%d\n", rc);
+ if (rc)
+ break;
+ if (copy_to_user(ucp, &kcp, sizeof(kcp)))
+ return -EFAULT;
+ memzero_explicit(&kcp, sizeof(kcp));
+ break;
+ }
+ case PKEY_FINDCARD: {
+ struct pkey_findcard __user *ufc = (void __user *) arg;
+ struct pkey_findcard kfc;
+
+ if (copy_from_user(&kfc, ufc, sizeof(kfc)))
+ return -EFAULT;
+ rc = pkey_findcard(&kfc.seckey,
+ &kfc.cardnr, &kfc.domain, 1);
+ DEBUG_DBG("pkey_ioctl pkey_findcard()=%d\n", rc);
+ if (rc)
+ break;
+ if (copy_to_user(ufc, &kfc, sizeof(kfc)))
+ return -EFAULT;
+ break;
+ }
+ case PKEY_SKEY2PKEY: {
+ struct pkey_skey2pkey __user *usp = (void __user *) arg;
+ struct pkey_skey2pkey ksp;
+
+ if (copy_from_user(&ksp, usp, sizeof(ksp)))
+ return -EFAULT;
+ rc = pkey_skey2pkey(&ksp.seckey, &ksp.protkey);
+ DEBUG_DBG("pkey_ioctl pkey_skey2pkey()=%d\n", rc);
+ if (rc)
+ break;
+ if (copy_to_user(usp, &ksp, sizeof(ksp)))
+ return -EFAULT;
+ break;
+ }
+ default:
+ /* unknown/unsupported ioctl cmd */
+ return -ENOTTY;
+ }
+
+ return rc;
+}
+
+/*
+ * Sysfs and file io operations
+ */
+static const struct file_operations pkey_fops = {
+ .owner = THIS_MODULE,
+ .open = nonseekable_open,
+ .llseek = no_llseek,
+ .unlocked_ioctl = pkey_unlocked_ioctl,
+};
+
+static struct miscdevice pkey_dev = {
+ .name = "pkey",
+ .minor = MISC_DYNAMIC_MINOR,
+ .mode = 0666,
+ .fops = &pkey_fops,
+};
+
+/*
+ * Module init
+ */
+int __init pkey_init(void)
+{
+ cpacf_mask_t pckmo_functions;
+
+ /* check for pckmo instructions available */
+ if (!cpacf_query(CPACF_PCKMO, &pckmo_functions))
+ return -EOPNOTSUPP;
+ if (!cpacf_test_func(&pckmo_functions, CPACF_PCKMO_ENC_AES_128_KEY) ||
+ !cpacf_test_func(&pckmo_functions, CPACF_PCKMO_ENC_AES_192_KEY) ||
+ !cpacf_test_func(&pckmo_functions, CPACF_PCKMO_ENC_AES_256_KEY))
+ return -EOPNOTSUPP;
+
+ pkey_debug_init();
+
+ return misc_register(&pkey_dev);
+}
+
+/*
+ * Module exit
+ */
+static void __exit pkey_exit(void)
+{
+ misc_deregister(&pkey_dev);
+ mkvp_cache_free();
+ pkey_debug_exit();
+}
+
+module_init(pkey_init);
+module_exit(pkey_exit);
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 144a17941e6f..93015f85d4a6 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -374,7 +374,7 @@ out:
return rc;
}
-static long zcrypt_send_cprb(struct ica_xcRB *xcRB)
+long zcrypt_send_cprb(struct ica_xcRB *xcRB)
{
struct zcrypt_card *zc, *pref_zc;
struct zcrypt_queue *zq, *pref_zq;
@@ -444,6 +444,7 @@ out:
AP_QID_CARD(qid), AP_QID_QUEUE(qid));
return rc;
}
+EXPORT_SYMBOL(zcrypt_send_cprb);
static bool is_desired_ep11_card(unsigned int dev_id,
unsigned short target_num,
@@ -619,7 +620,7 @@ out:
return rc;
}
-static void zcrypt_device_status_mask(struct zcrypt_device_matrix *matrix)
+void zcrypt_device_status_mask(struct zcrypt_device_matrix *matrix)
{
struct zcrypt_card *zc;
struct zcrypt_queue *zq;
diff --git a/drivers/s390/crypto/zcrypt_api.h b/drivers/s390/crypto/zcrypt_api.h
index 274a59051534..6c94efd23eac 100644
--- a/drivers/s390/crypto/zcrypt_api.h
+++ b/drivers/s390/crypto/zcrypt_api.h
@@ -190,5 +190,7 @@ void zcrypt_msgtype_unregister(struct zcrypt_ops *);
struct zcrypt_ops *zcrypt_msgtype(unsigned char *, int);
int zcrypt_api_init(void);
void zcrypt_api_exit(void);
+long zcrypt_send_cprb(struct ica_xcRB *xcRB);
+void zcrypt_device_status_mask(struct zcrypt_device_matrix *devstatus);
#endif /* _ZCRYPT_API_H_ */
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 439b946c4808..db5900aaa55a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -478,7 +478,7 @@ static void kernfs_drain(struct kernfs_node *kn)
rwsem_release(&kn->dep_map, 1, _RET_IP_);
}
- kernfs_unmap_bin_file(kn);
+ kernfs_drain_open_files(kn);
mutex_lock(&kernfs_mutex);
}
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 4f0535890b30..35043a8c4529 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -515,7 +515,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
goto out_put;
rc = 0;
- of->mmapped = 1;
+ of->mmapped = true;
of->vm_ops = vma->vm_ops;
vma->vm_ops = &kernfs_vm_ops;
out_put:
@@ -707,7 +707,8 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
if (error)
goto err_free;
- ((struct seq_file *)file->private_data)->private = of;
+ of->seq_file = file->private_data;
+ of->seq_file->private = of;
/* seq_file clears PWRITE unconditionally, restore it if WRITE */
if (file->f_mode & FMODE_WRITE)
@@ -716,13 +717,22 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
/* make sure we have open node struct */
error = kernfs_get_open_node(kn, of);
if (error)
- goto err_close;
+ goto err_seq_release;
+
+ if (ops->open) {
+ /* nobody has access to @of yet, skip @of->mutex */
+ error = ops->open(of);
+ if (error)
+ goto err_put_node;
+ }
/* open succeeded, put active references */
kernfs_put_active(kn);
return 0;
-err_close:
+err_put_node:
+ kernfs_put_open_node(kn, of);
+err_seq_release:
seq_release(inode, file);
err_free:
kfree(of->prealloc_buf);
@@ -732,11 +742,41 @@ err_out:
return error;
}
+/* used from release/drain to ensure that ->release() is called exactly once */
+static void kernfs_release_file(struct kernfs_node *kn,
+ struct kernfs_open_file *of)
+{
+ /*
+ * @of is guaranteed to have no other file operations in flight and
+ * we just want to synchronize release and drain paths.
+ * @kernfs_open_file_mutex is enough. @of->mutex can't be used
+ * here because drain path may be called from places which can
+ * cause circular dependency.
+ */
+ lockdep_assert_held(&kernfs_open_file_mutex);
+
+ if (!of->released) {
+ /*
+ * A file is never detached without being released and we
+ * need to be able to release files which are deactivated
+ * and being drained. Don't use kernfs_ops().
+ */
+ kn->attr.ops->release(of);
+ of->released = true;
+ }
+}
+
static int kernfs_fop_release(struct inode *inode, struct file *filp)
{
struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
struct kernfs_open_file *of = kernfs_of(filp);
+ if (kn->flags & KERNFS_HAS_RELEASE) {
+ mutex_lock(&kernfs_open_file_mutex);
+ kernfs_release_file(kn, of);
+ mutex_unlock(&kernfs_open_file_mutex);
+ }
+
kernfs_put_open_node(kn, of);
seq_release(inode, filp);
kfree(of->prealloc_buf);
@@ -745,12 +785,12 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
return 0;
}
-void kernfs_unmap_bin_file(struct kernfs_node *kn)
+void kernfs_drain_open_files(struct kernfs_node *kn)
{
struct kernfs_open_node *on;
struct kernfs_open_file *of;
- if (!(kn->flags & KERNFS_HAS_MMAP))
+ if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
return;
spin_lock_irq(&kernfs_open_node_lock);
@@ -762,10 +802,16 @@ void kernfs_unmap_bin_file(struct kernfs_node *kn)
return;
mutex_lock(&kernfs_open_file_mutex);
+
list_for_each_entry(of, &on->files, list) {
struct inode *inode = file_inode(of->file);
- unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+ if (kn->flags & KERNFS_HAS_MMAP)
+ unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+ kernfs_release_file(kn, of);
}
+
mutex_unlock(&kernfs_open_file_mutex);
kernfs_put_open_node(kn, NULL);
@@ -964,6 +1010,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
kn->flags |= KERNFS_HAS_SEQ_SHOW;
if (ops->mmap)
kn->flags |= KERNFS_HAS_MMAP;
+ if (ops->release)
+ kn->flags |= KERNFS_HAS_RELEASE;
rc = kernfs_add_one(kn);
if (rc) {
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index bfd551bbf231..3100987cf8ba 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -104,7 +104,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
*/
extern const struct file_operations kernfs_file_fops;
-void kernfs_unmap_bin_file(struct kernfs_node *kn);
+void kernfs_drain_open_files(struct kernfs_node *kn);
/*
* symlink.c
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 861b4677fc5b..3c02404cfce9 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -148,14 +148,18 @@ struct cgroup_subsys_state {
* set for a task.
*/
struct css_set {
- /* Reference count */
- atomic_t refcount;
-
/*
- * List running through all cgroup groups in the same hash
- * slot. Protected by css_set_lock
+ * Set of subsystem states, one for each subsystem. This array is
+ * immutable after creation apart from the init_css_set during
+ * subsystem registration (at boot time).
*/
- struct hlist_node hlist;
+ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+ /* reference count */
+ atomic_t refcount;
+
+ /* the default cgroup associated with this css_set */
+ struct cgroup *dfl_cgrp;
/*
* Lists running through all tasks using this cgroup group.
@@ -167,21 +171,29 @@ struct css_set {
struct list_head tasks;
struct list_head mg_tasks;
+ /* all css_task_iters currently walking this cset */
+ struct list_head task_iters;
+
/*
- * List of cgrp_cset_links pointing at cgroups referenced from this
- * css_set. Protected by css_set_lock.
+ * On the default hierarhcy, ->subsys[ssid] may point to a css
+ * attached to an ancestor instead of the cgroup this css_set is
+ * associated with. The following node is anchored at
+ * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
+ * iterate through all css's attached to a given cgroup.
*/
- struct list_head cgrp_links;
+ struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
- /* the default cgroup associated with this css_set */
- struct cgroup *dfl_cgrp;
+ /*
+ * List running through all cgroup groups in the same hash
+ * slot. Protected by css_set_lock
+ */
+ struct hlist_node hlist;
/*
- * Set of subsystem states, one for each subsystem. This array is
- * immutable after creation apart from the init_css_set during
- * subsystem registration (at boot time).
+ * List of cgrp_cset_links pointing at cgroups referenced from this
+ * css_set. Protected by css_set_lock.
*/
- struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+ struct list_head cgrp_links;
/*
* List of csets participating in the on-going migration either as
@@ -201,18 +213,6 @@ struct css_set {
struct cgroup *mg_dst_cgrp;
struct css_set *mg_dst_cset;
- /*
- * On the default hierarhcy, ->subsys[ssid] may point to a css
- * attached to an ancestor instead of the cgroup this css_set is
- * associated with. The following node is anchored at
- * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
- * iterate through all css's attached to a given cgroup.
- */
- struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
-
- /* all css_task_iters currently walking this cset */
- struct list_head task_iters;
-
/* dead and being drained, ignore for migration */
bool dead;
@@ -388,6 +388,9 @@ struct cftype {
struct list_head node; /* anchored at ss->cfts */
struct kernfs_ops *kf_ops;
+ int (*open)(struct kernfs_open_file *of);
+ void (*release)(struct kernfs_open_file *of);
+
/*
* read_u64() is a shortcut for the common case of returning a
* single integer. Use it in place of read()
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c83c23f0577b..f6b43fbb141c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -266,7 +266,7 @@ void css_task_iter_end(struct css_task_iter *it);
* cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
* @leader: the loop cursor
* @dst_css: the destination css
- * @tset: takset to iterate
+ * @tset: taskset to iterate
*
* Iterate threadgroup leaders of @tset. For single-task migrations, @tset
* may not contain any.
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
new file mode 100644
index 000000000000..e94290b29e99
--- /dev/null
+++ b/include/linux/cgroup_rdma.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#ifndef _CGROUP_RDMA_H
+#define _CGROUP_RDMA_H
+
+#include <linux/cgroup.h>
+
+enum rdmacg_resource_type {
+ RDMACG_RESOURCE_HCA_HANDLE,
+ RDMACG_RESOURCE_HCA_OBJECT,
+ RDMACG_RESOURCE_MAX,
+};
+
+#ifdef CONFIG_CGROUP_RDMA
+
+struct rdma_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * head to keep track of all resource pools
+ * that belongs to this cgroup.
+ */
+ struct list_head rpools;
+};
+
+struct rdmacg_device {
+ struct list_head dev_node;
+ struct list_head rpools;
+ char *name;
+};
+
+/*
+ * APIs for RDMA/IB stack to publish when a device wants to
+ * participate in resource accounting
+ */
+int rdmacg_register_device(struct rdmacg_device *device);
+void rdmacg_unregister_device(struct rdmacg_device *device);
+
+/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index);
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index);
+#endif /* CONFIG_CGROUP_RDMA */
+#endif /* _CGROUP_RDMA_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336acee9..d0e597c44585 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
SUBSYS(pids)
#endif
+#if IS_ENABLED(CONFIG_CGROUP_RDMA)
+SUBSYS(rdma)
+#endif
+
/*
* The following subsystems are not supported on the default hierarchy.
*/
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b2eb9c0a68c4..f8110051188f 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -105,29 +105,36 @@ struct ftrace_branch_data {
};
};
+struct ftrace_likely_data {
+ struct ftrace_branch_data data;
+ unsigned long constant;
+};
+
/*
* Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code
* to disable branch tracing on a per file basis.
*/
#if defined(CONFIG_TRACE_BRANCH_PROFILING) \
&& !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__)
-void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
+void ftrace_likely_update(struct ftrace_likely_data *f, int val,
+ int expect, int is_constant);
#define likely_notrace(x) __builtin_expect(!!(x), 1)
#define unlikely_notrace(x) __builtin_expect(!!(x), 0)
-#define __branch_check__(x, expect) ({ \
+#define __branch_check__(x, expect, is_constant) ({ \
int ______r; \
- static struct ftrace_branch_data \
+ static struct ftrace_likely_data \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_annotated_branch"))) \
______f = { \
- .func = __func__, \
- .file = __FILE__, \
- .line = __LINE__, \
+ .data.func = __func__, \
+ .data.file = __FILE__, \
+ .data.line = __LINE__, \
}; \
- ______r = likely_notrace(x); \
- ftrace_likely_update(&______f, ______r, expect); \
+ ______r = __builtin_expect(!!(x), expect); \
+ ftrace_likely_update(&______f, ______r, \
+ expect, is_constant); \
______r; \
})
@@ -137,10 +144,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
* written by Daniel Walker.
*/
# ifndef likely
-# define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
+# define likely(x) (__branch_check__(x, 1, __builtin_constant_p(x)))
# endif
# ifndef unlikely
-# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
+# define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x)))
# endif
#ifdef CONFIG_PROFILE_ALL_BRANCHES
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index b63d6b7b0db0..8e06d758ee48 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -89,11 +89,17 @@ extern bool static_key_initialized;
struct static_key {
atomic_t enabled;
-/* Set lsb bit to 1 if branch is default true, 0 ot */
- struct jump_entry *entries;
-#ifdef CONFIG_MODULES
- struct static_key_mod *next;
-#endif
+/*
+ * bit 0 => 1 if key is initially true
+ * 0 if initially false
+ * bit 1 => 1 if points to struct static_key_mod
+ * 0 if points to struct jump_entry
+ */
+ union {
+ unsigned long type;
+ struct jump_entry *entries;
+ struct static_key_mod *next;
+ };
};
#else
@@ -118,9 +124,10 @@ struct module;
#ifdef HAVE_JUMP_LABEL
-#define JUMP_TYPE_FALSE 0UL
-#define JUMP_TYPE_TRUE 1UL
-#define JUMP_TYPE_MASK 1UL
+#define JUMP_TYPE_FALSE 0UL
+#define JUMP_TYPE_TRUE 1UL
+#define JUMP_TYPE_LINKED 2UL
+#define JUMP_TYPE_MASK 3UL
static __always_inline bool static_key_false(struct static_key *key)
{
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 7056238fd9f5..a9b11b8d06f2 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -46,6 +46,7 @@ enum kernfs_node_flag {
KERNFS_SUICIDAL = 0x0400,
KERNFS_SUICIDED = 0x0800,
KERNFS_EMPTY_DIR = 0x1000,
+ KERNFS_HAS_RELEASE = 0x2000,
};
/* @flags for kernfs_create_root() */
@@ -175,6 +176,7 @@ struct kernfs_open_file {
/* published fields */
struct kernfs_node *kn;
struct file *file;
+ struct seq_file *seq_file;
void *priv;
/* private fields, do not use outside kernfs proper */
@@ -185,12 +187,20 @@ struct kernfs_open_file {
char *prealloc_buf;
size_t atomic_write_len;
- bool mmapped;
+ bool mmapped:1;
+ bool released:1;
const struct vm_operations_struct *vm_ops;
};
struct kernfs_ops {
/*
+ * Optional open/release methods. Both are called with
+ * @of->seq_file populated.
+ */
+ int (*open)(struct kernfs_open_file *of);
+ void (*release)(struct kernfs_open_file *of);
+
+ /*
* Read is handled by either seq_file or raw_read().
*
* If seq_show() is present, seq_file path is active. Other seq
diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h
index 6483a6fdce59..ffb21e79204d 100644
--- a/include/linux/mfd/tps65910.h
+++ b/include/linux/mfd/tps65910.h
@@ -134,6 +134,7 @@
/* RTC_CTRL_REG bitfields */
#define TPS65910_RTC_CTRL_STOP_RTC 0x01 /*0=stop, 1=run */
+#define TPS65910_RTC_CTRL_AUTO_COMP 0x04
#define TPS65910_RTC_CTRL_GET_TIME 0x40
/* RTC_STATUS_REG bitfields */
diff --git a/include/linux/platform_data/rtc-m48t86.h b/include/linux/platform_data/rtc-m48t86.h
deleted file mode 100644
index 915d6b4f0f89..000000000000
--- a/include/linux/platform_data/rtc-m48t86.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * ST M48T86 / Dallas DS12887 RTC driver
- * Copyright (c) 2006 Tower Technologies
- *
- * Author: Alessandro Zummo <a.zummo@towertech.it>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-struct m48t86_ops
-{
- void (*writebyte)(unsigned char value, unsigned long addr);
- unsigned char (*readbyte)(unsigned long addr);
-};
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 5a209b84fd9e..c7bdf895179c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -61,6 +61,8 @@ struct timer_list {
#define TIMER_ARRAYSHIFT 22
#define TIMER_ARRAYMASK 0xFFC00000
+#define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
+
#define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
.entry = { .next = TIMER_ENTRY_STATIC }, \
.function = (_function), \
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a26cc437293c..bde063cefd04 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -106,9 +106,9 @@ struct work_struct {
#endif
};
-#define WORK_DATA_INIT() ATOMIC_LONG_INIT(WORK_STRUCT_NO_POOL)
+#define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
#define WORK_DATA_STATIC_INIT() \
- ATOMIC_LONG_INIT(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)
+ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))
struct delayed_work {
struct work_struct work;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index d84849c5dc05..0f1813c13687 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -60,6 +60,7 @@
#include <linux/atomic.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
+#include <linux/cgroup_rdma.h>
extern struct workqueue_struct *ib_wq;
extern struct workqueue_struct *ib_comp_wq;
@@ -1356,6 +1357,12 @@ struct ib_fmr_attr {
struct ib_umem;
+struct ib_rdmacg_object {
+#ifdef CONFIG_CGROUP_RDMA
+ struct rdma_cgroup *cg; /* owner rdma cgroup */
+#endif
+};
+
struct ib_ucontext {
struct ib_device *device;
struct list_head pd_list;
@@ -1388,6 +1395,8 @@ struct ib_ucontext {
struct list_head no_private_counters;
int odp_mrs_count;
#endif
+
+ struct ib_rdmacg_object cg_obj;
};
struct ib_uobject {
@@ -1395,6 +1404,7 @@ struct ib_uobject {
struct ib_ucontext *context; /* associated user context */
void *object; /* containing object */
struct list_head list; /* link to context's list */
+ struct ib_rdmacg_object cg_obj; /* rdmacg object */
int id; /* index into kernel idr */
struct kref ref;
struct rw_semaphore mutex; /* protects .live */
@@ -2128,6 +2138,10 @@ struct ib_device {
struct attribute_group *hw_stats_ag;
struct rdma_hw_stats *hw_stats;
+#ifdef CONFIG_CGROUP_RDMA
+ struct rdmacg_device cg_device;
+#endif
+
/**
* The following mandatory functions are used only at device
* registration. Keep functions such as these at the end of this
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 1bca99dbb98f..80787eafba99 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -36,6 +36,13 @@ DEFINE_EVENT(timer_class, timer_init,
TP_ARGS(timer)
);
+#define decode_timer_flags(flags) \
+ __print_flags(flags, "|", \
+ { TIMER_MIGRATING, "M" }, \
+ { TIMER_DEFERRABLE, "D" }, \
+ { TIMER_PINNED, "P" }, \
+ { TIMER_IRQSAFE, "I" })
+
/**
* timer_start - called when the timer is started
* @timer: pointer to struct timer_list
@@ -65,9 +72,12 @@ TRACE_EVENT(timer_start,
__entry->flags = flags;
),
- TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] flags=0x%08x",
+ TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s",
__entry->timer, __entry->function, __entry->expires,
- (long)__entry->expires - __entry->now, __entry->flags)
+ (long)__entry->expires - __entry->now,
+ __entry->flags & TIMER_CPUMASK,
+ __entry->flags >> TIMER_ARRAYSHIFT,
+ decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
);
/**
diff --git a/init/Kconfig b/init/Kconfig
index 8c39615165b7..a92f27da4a27 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1078,6 +1078,16 @@ config CGROUP_PIDS
since the PIDs limit only affects a process's ability to fork, not to
attach to a cgroup.
+config CGROUP_RDMA
+ bool "RDMA controller"
+ help
+ Provides enforcement of RDMA resources defined by IB stack.
+ It is fairly easy for consumers to exhaust RDMA resources, which
+ can result into resource unavailability to other consumers.
+ RDMA controller is designed to stop this from happening.
+ Attaching processes with active RDMA resources to the cgroup
+ hierarchy is allowed even if can cross the hierarchy's limit.
+
config CGROUP_FREEZER
bool "Freezer controller"
help
diff --git a/kernel/Makefile b/kernel/Makefile
index 12c679f769c6..b302b4731d16 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -64,10 +64,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
-obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
-obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
-obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUPS) += cgroup/
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
new file mode 100644
index 000000000000..387348a40c64
--- /dev/null
+++ b/kernel/cgroup/Makefile
@@ -0,0 +1,6 @@
+obj-y := cgroup.o namespace.o cgroup-v1.o
+
+obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += pids.o
+obj-$(CONFIG_CGROUP_RDMA) += rdma.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
new file mode 100644
index 000000000000..9203bfb05603
--- /dev/null
+++ b/kernel/cgroup/cgroup-internal.h
@@ -0,0 +1,214 @@
+#ifndef __CGROUP_INTERNAL_H
+#define __CGROUP_INTERNAL_H
+
+#include <linux/cgroup.h>
+#include <linux/kernfs.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+ /* the cgroup and css_set this link associates */
+ struct cgroup *cgrp;
+ struct css_set *cset;
+
+ /* list of cgrp_cset_links anchored at cgrp->cset_links */
+ struct list_head cset_link;
+
+ /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+ struct list_head cgrp_link;
+};
+
+/* used to track tasks and csets during migration */
+struct cgroup_taskset {
+ /* the src and dst cset list running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /* the subsys currently being processed */
+ int ssid;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets point to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_csets and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
+};
+
+/* migration context also tracks preloading */
+struct cgroup_mgctx {
+ /*
+ * Preloaded source and destination csets. Used to guarantee
+ * atomic success or failure on actual migration.
+ */
+ struct list_head preloaded_src_csets;
+ struct list_head preloaded_dst_csets;
+
+ /* tasks and csets to migrate */
+ struct cgroup_taskset tset;
+
+ /* subsystems affected by migration */
+ u16 ss_mask;
+};
+
+#define CGROUP_TASKSET_INIT(tset) \
+{ \
+ .src_csets = LIST_HEAD_INIT(tset.src_csets), \
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
+ .csets = &tset.src_csets, \
+}
+
+#define CGROUP_MGCTX_INIT(name) \
+{ \
+ LIST_HEAD_INIT(name.preloaded_src_csets), \
+ LIST_HEAD_INIT(name.preloaded_dst_csets), \
+ CGROUP_TASKSET_INIT(name.tset), \
+}
+
+#define DEFINE_CGROUP_MGCTX(name) \
+ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
+
+struct cgroup_sb_opts {
+ u16 subsys_mask;
+ unsigned int flags;
+ char *release_agent;
+ bool cpuset_clone_children;
+ char *name;
+ /* User explicitly requested empty subsystem */
+ bool none;
+};
+
+extern struct mutex cgroup_mutex;
+extern spinlock_t css_set_lock;
+extern struct cgroup_subsys *cgroup_subsys[];
+extern struct list_head cgroup_roots;
+extern struct file_system_type cgroup_fs_type;
+
+/* iterate across the hierarchies */
+#define for_each_root(root) \
+ list_for_each_entry((root), &cgroup_roots, root_list)
+
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+ return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static inline bool notify_on_release(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
+void put_css_set_locked(struct css_set *cset);
+
+static inline void put_css_set(struct css_set *cset)
+{
+ unsigned long flags;
+
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cset->refcount, -1, 1))
+ return;
+
+ spin_lock_irqsave(&css_set_lock, flags);
+ put_css_set_locked(cset);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+}
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
+{
+ atomic_inc(&cset->refcount);
+}
+
+bool cgroup_ssid_enabled(int ssid);
+bool cgroup_on_dfl(const struct cgroup *cgrp);
+
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root);
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
+void cgroup_kn_unlock(struct kernfs_node *kn);
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns);
+
+void cgroup_free_root(struct cgroup_root *root);
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+ struct cgroup_root *root, unsigned long magic,
+ struct cgroup_namespace *ns);
+
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
+void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
+ struct cgroup_mgctx *mgctx);
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+ struct cgroup_mgctx *mgctx);
+
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+ bool threadgroup);
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup);
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off);
+
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
+int cgroup_rmdir(struct kernfs_node *kn);
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+ struct kernfs_root *kf_root);
+
+/*
+ * namespace.c
+ */
+extern const struct proc_ns_operations cgroupns_operations;
+
+/*
+ * cgroup-v1.c
+ */
+extern struct cftype cgroup1_base_files[];
+extern const struct file_operations proc_cgroupstats_operations;
+extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+
+bool cgroup1_ssid_disabled(int ssid);
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
+void cgroup1_release_agent(struct work_struct *work);
+void cgroup1_check_for_release(struct cgroup *cgrp);
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+ void *data, unsigned long magic,
+ struct cgroup_namespace *ns);
+
+#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
new file mode 100644
index 000000000000..fc34bcf2329f
--- /dev/null
+++ b/kernel/cgroup/cgroup-v1.c
@@ -0,0 +1,1395 @@
+#include "cgroup-internal.h"
+
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/sort.h>
+#include <linux/delay.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/delayacct.h>
+#include <linux/pid_namespace.h>
+#include <linux/cgroupstats.h>
+
+#include <trace/events/cgroup.h>
+
+/*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
+
+/*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/*
+ * Protects cgroup_subsys->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
+
+bool cgroup1_ssid_disabled(int ssid)
+{
+ return cgroup_no_v1_mask & (1 << ssid);
+}
+
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+ struct cgroup_root *root;
+ int retval = 0;
+
+ mutex_lock(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+ for_each_root(root) {
+ struct cgroup *from_cgrp;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ spin_lock_irq(&css_set_lock);
+ from_cgrp = task_cgroup_from_root(from, root);
+ spin_unlock_irq(&css_set_lock);
+
+ retval = cgroup_attach_task(from_cgrp, tsk, false);
+ if (retval)
+ break;
+ }
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ mutex_unlock(&cgroup_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+/**
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ *
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup. No task
+ * can slip out of migration through forking.
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+ DEFINE_CGROUP_MGCTX(mgctx);
+ struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret;
+
+ if (cgroup_on_dfl(to))
+ return -EINVAL;
+
+ if (!cgroup_may_migrate_to(to))
+ return -EBUSY;
+
+ mutex_lock(&cgroup_mutex);
+
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
+ /* all tasks in @from are being moved, all csets are source */
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &from->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, to, &mgctx);
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_migrate_prepare_dst(&mgctx);
+ if (ret)
+ goto out_err;
+
+ /*
+ * Migrate tasks one-by-one until @from is empty. This fails iff
+ * ->can_attach() fails.
+ */
+ do {
+ css_task_iter_start(&from->self, &it);
+ task = css_task_iter_next(&it);
+ if (task)
+ get_task_struct(task);
+ css_task_iter_end(&it);
+
+ if (task) {
+ ret = cgroup_migrate(task, false, &mgctx);
+ if (!ret)
+ trace_cgroup_transfer_tasks(to, task, false);
+ put_task_struct(task);
+ }
+ } while (task && !ret);
+out_err:
+ cgroup_migrate_finish(&mgctx);
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+/*
+ * Stuff for reading the 'tasks'/'procs' files.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ */
+
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* for delayed destruction */
+ struct delayed_work destroy_dwork;
+};
+
+/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+ if (PIDLIST_TOO_LARGE(count))
+ return vmalloc(count * sizeof(pid_t));
+ else
+ return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+
+static void pidlist_free(void *p)
+{
+ kvfree(p);
+}
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer. None
+ * should be left afterwards.
+ */
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
+{
+ struct cgroup_pidlist *l, *tmp_l;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+ mutex_unlock(&cgrp->pidlist_mutex);
+
+ flush_workqueue(cgroup_pidlist_destroy_wq);
+ BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+ destroy_dwork);
+ struct cgroup_pidlist *tofree = NULL;
+
+ mutex_lock(&l->owner->pidlist_mutex);
+
+ /*
+ * Destroy iff we didn't get queued again. The state won't change
+ * as destroy_dwork can only be queued while locked.
+ */
+ if (!delayed_work_pending(dwork)) {
+ list_del(&l->links);
+ pidlist_free(l->list);
+ put_pid_ns(l->key.ns);
+ tofree = l;
+ }
+
+ mutex_unlock(&l->owner->pidlist_mutex);
+ kfree(tofree);
+}
+
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * Returns the number of unique elements.
+ */
+static int pidlist_uniq(pid_t *list, int length)
+{
+ int src, dest = 1;
+
+ /*
+ * we presume the 0th element is unique, so i starts at 1. trivial
+ * edge cases first; no work needs to be done for either
+ */
+ if (length == 0 || length == 1)
+ return length;
+ /* src and dest walk down the list; dest counts unique elements */
+ for (src = 1; src < length; src++) {
+ /* find next unique element */
+ while (list[src] == list[src-1]) {
+ src++;
+ if (src == length)
+ goto after;
+ }
+ /* dest always points to where the next unique element goes */
+ list[dest] = list[src];
+ dest++;
+ }
+after:
+ return dest;
+}
+
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco. As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer. As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ */
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = task_active_pid_ns(current);
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ list_for_each_entry(l, &cgrp->pidlists, links)
+ if (l->key.type == type && l->key.ns == ns)
+ return l;
+ return NULL;
+}
+
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ l = cgroup_pidlist_find(cgrp, type);
+ if (l)
+ return l;
+
+ /* entry not found; create a new one */
+ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l)
+ return l;
+
+ INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
+ l->key.type = type;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ l->key.ns = get_pid_ns(task_active_pid_ns(current));
+ l->owner = cgrp;
+ list_add(&l->links, &cgrp->pidlists);
+ return l;
+}
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup. The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
+ */
+static int cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count = 0;
+ struct cgrp_cset_link *link;
+
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ count += atomic_read(&link->cset->refcount);
+ spin_unlock_irq(&css_set_lock);
+ return count;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+ struct cgroup_pidlist **lp)
+{
+ pid_t *array;
+ int length;
+ int pid, n = 0; /* used for populating the array */
+ struct css_task_iter it;
+ struct task_struct *tsk;
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ /*
+ * If cgroup gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional cgroup users didn't
+ * show up until sometime later on.
+ */
+ length = cgroup_task_count(cgrp);
+ array = pidlist_allocate(length);
+ if (!array)
+ return -ENOMEM;
+ /* now, populate the array */
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
+ if (unlikely(n == length))
+ break;
+ /* get tgid or pid for procs or tasks file respectively */
+ if (type == CGROUP_FILE_PROCS)
+ pid = task_tgid_vnr(tsk);
+ else
+ pid = task_pid_vnr(tsk);
+ if (pid > 0) /* make sure to only use valid results */
+ array[n++] = pid;
+ }
+ css_task_iter_end(&it);
+ length = n;
+ /* now sort & (if procs) strip out duplicates */
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (type == CGROUP_FILE_PROCS)
+ length = pidlist_uniq(array, length);
+
+ l = cgroup_pidlist_find_create(cgrp, type);
+ if (!l) {
+ pidlist_free(array);
+ return -ENOMEM;
+ }
+
+ /* store array, freeing old if necessary */
+ pidlist_free(l->list);
+ l->list = array;
+ l->length = length;
+ *lp = l;
+ return 0;
+}
+
+/*
+ * seq_file methods for the tasks/procs files. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->l->list array.
+ */
+
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+{
+ /*
+ * Initially we receive a position value that corresponds to
+ * one more than the last pid shown (or 0 on the first call or
+ * after a seek to the start). Use a binary-search to find the
+ * next pid to display, if any
+ */
+ struct kernfs_open_file *of = s->private;
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+ struct cgroup_pidlist *l;
+ enum cgroup_filetype type = seq_cft(s)->private;
+ int index = 0, pid = *pos;
+ int *iter, ret;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+
+ /*
+ * !NULL @of->priv indicates that this isn't the first start()
+ * after open. If the matching pidlist is around, we can use that.
+ * Look for it. Note that @of->priv can't be used directly. It
+ * could already have been destroyed.
+ */
+ if (of->priv)
+ of->priv = cgroup_pidlist_find(cgrp, type);
+
+ /*
+ * Either this is the first start() after open or the matching
+ * pidlist has been destroyed inbetween. Create a new one.
+ */
+ if (!of->priv) {
+ ret = pidlist_array_load(cgrp, type,
+ (struct cgroup_pidlist **)&of->priv);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ l = of->priv;
+
+ if (pid) {
+ int end = l->length;
+
+ while (index < end) {
+ int mid = (index + end) / 2;
+ if (l->list[mid] == pid) {
+ index = mid;
+ break;
+ } else if (l->list[mid] <= pid)
+ index = mid + 1;
+ else
+ end = mid;
+ }
+ }
+ /* If we're off the end of the array, we're done */
+ if (index >= l->length)
+ return NULL;
+ /* Update the abstract position to be the actual pid that we found */
+ iter = l->list + index;
+ *pos = *iter;
+ return iter;
+}
+
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+
+ if (l)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+ CGROUP_PIDLIST_DESTROY_DELAY);
+ mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
+}
+
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+ pid_t *p = v;
+ pid_t *end = l->list + l->length;
+ /*
+ * Advance to the next pid in the array. If this goes off the
+ * end, we're done
+ */
+ p++;
+ if (p >= end) {
+ return NULL;
+ } else {
+ *pos = *p;
+ return p;
+ }
+}
+
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
+{
+ seq_printf(s, "%d\n", *(int *)v);
+
+ return 0;
+}
+
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, nbytes, off, false);
+}
+
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+
+ BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+ spin_lock(&release_agent_path_lock);
+ strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ sizeof(cgrp->root->release_agent_path));
+ spin_unlock(&release_agent_path_lock);
+ cgroup_kn_unlock(of->kn);
+ return nbytes;
+}
+
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ spin_lock(&release_agent_path_lock);
+ seq_puts(seq, cgrp->root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+ seq_putc(seq, '\n');
+ return 0;
+}
+
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
+{
+ seq_puts(seq, "0\n");
+ return 0;
+}
+
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return notify_on_release(css->cgroup);
+}
+
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ if (val)
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ return 0;
+}
+
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+}
+
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ if (val)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ return 0;
+}
+
+/* cgroup core interface files for the legacy hierarchies */
+struct cftype cgroup1_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
+ {
+ .name = "tasks",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_TASKS,
+ .write = cgroup_tasks_write,
+ },
+ {
+ .name = "notify_on_release",
+ .read_u64 = cgroup_read_notify_on_release,
+ .write_u64 = cgroup_write_notify_on_release,
+ },
+ {
+ .name = "release_agent",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_release_agent_show,
+ .write = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX - 1,
+ },
+ { } /* terminate */
+};
+
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ /*
+ * ideally we don't want subsystems moving around while we do this.
+ * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+ * subsys/hierarchy state.
+ */
+ mutex_lock(&cgroup_mutex);
+
+ for_each_subsys(ss, i)
+ seq_printf(m, "%s\t%d\t%d\t%d\n",
+ ss->legacy_name, ss->root->hierarchy_id,
+ atomic_read(&ss->root->nr_cgrps),
+ cgroup_ssid_enabled(i));
+
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_cgroupstats_show, NULL);
+}
+
+const struct file_operations proc_cgroupstats_operations = {
+ .open = cgroupstats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * cgroupstats_build - build and fill cgroupstats
+ * @stats: cgroupstats to fill information into
+ * @dentry: A dentry entry belonging to the cgroup for which stats have
+ * been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
+ */
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+{
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup *cgrp;
+ struct css_task_iter it;
+ struct task_struct *tsk;
+
+ /* it should be kernfs_node belonging to cgroupfs and is a directory */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return -EINVAL;
+
+ mutex_lock(&cgroup_mutex);
+
+ /*
+ * We aren't being called from kernfs and there's no guarantee on
+ * @kn->priv's validity. For this and css_tryget_online_from_dir(),
+ * @kn->priv is RCU safe. Let's do the RCU dancing.
+ */
+ rcu_read_lock();
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (!cgrp || cgroup_is_dead(cgrp)) {
+ rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return -ENOENT;
+ }
+ rcu_read_unlock();
+
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
+ switch (tsk->state) {
+ case TASK_RUNNING:
+ stats->nr_running++;
+ break;
+ case TASK_INTERRUPTIBLE:
+ stats->nr_sleeping++;
+ break;
+ case TASK_UNINTERRUPTIBLE:
+ stats->nr_uninterruptible++;
+ break;
+ case TASK_STOPPED:
+ stats->nr_stopped++;
+ break;
+ default:
+ if (delayacct_is_task_waiting_on_io(tsk))
+ stats->nr_io_wait++;
+ break;
+ }
+ }
+ css_task_iter_end(&it);
+
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+void cgroup1_check_for_release(struct cgroup *cgrp)
+{
+ if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
+ !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+ schedule_work(&cgrp->release_agent_work);
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence. Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d. The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task. We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ */
+void cgroup1_release_agent(struct work_struct *work)
+{
+ struct cgroup *cgrp =
+ container_of(work, struct cgroup, release_agent_work);
+ char *pathbuf = NULL, *agentbuf = NULL;
+ char *argv[3], *envp[3];
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+ if (!pathbuf || !agentbuf)
+ goto out;
+
+ spin_lock_irq(&css_set_lock);
+ ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ if (ret < 0 || ret >= PATH_MAX)
+ goto out;
+
+ argv[0] = agentbuf;
+ argv[1] = pathbuf;
+ argv[2] = NULL;
+
+ /* minimal command environment */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ mutex_unlock(&cgroup_mutex);
+ call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ goto out_free;
+out:
+ mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(agentbuf);
+ kfree(pathbuf);
+}
+
+/*
+ * cgroup_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name_str)
+{
+ struct cgroup *cgrp = kn->priv;
+ int ret;
+
+ if (kernfs_type(kn) != KERNFS_DIR)
+ return -ENOTDIR;
+ if (kn->parent != new_parent)
+ return -EIO;
+
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. kernfs_rename() doesn't require active_ref
+ * protection. Break them before grabbing cgroup_mutex.
+ */
+ kernfs_break_active_protection(new_parent);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&cgroup_mutex);
+
+ ret = kernfs_rename(kn, new_parent, new_name_str);
+ if (!ret)
+ trace_cgroup_rename(cgrp);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ kernfs_unbreak_active_protection(new_parent);
+ return ret;
+}
+
+static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_show_option(seq, ss->legacy_name, NULL);
+ if (root->flags & CGRP_ROOT_NOPREFIX)
+ seq_puts(seq, ",noprefix");
+ if (root->flags & CGRP_ROOT_XATTR)
+ seq_puts(seq, ",xattr");
+
+ spin_lock(&release_agent_path_lock);
+ if (strlen(root->release_agent_path))
+ seq_show_option(seq, "release_agent",
+ root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
+ seq_puts(seq, ",clone_children");
+ if (strlen(root->name))
+ seq_show_option(seq, "name", root->name);
+ return 0;
+}
+
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
+{
+ char *token, *o = data;
+ bool all_ss = false, one_ss = false;
+ u16 mask = U16_MAX;
+ struct cgroup_subsys *ss;
+ int nr_opts = 0;
+ int i;
+
+#ifdef CONFIG_CPUSETS
+ mask = ~((u16)1 << cpuset_cgrp_id);
+#endif
+
+ memset(opts, 0, sizeof(*opts));
+
+ while ((token = strsep(&o, ",")) != NULL) {
+ nr_opts++;
+
+ if (!*token)
+ return -EINVAL;
+ if (!strcmp(token, "none")) {
+ /* Explicitly have no subsystems */
+ opts->none = true;
+ continue;
+ }
+ if (!strcmp(token, "all")) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (one_ss)
+ return -EINVAL;
+ all_ss = true;
+ continue;
+ }
+ if (!strcmp(token, "noprefix")) {
+ opts->flags |= CGRP_ROOT_NOPREFIX;
+ continue;
+ }
+ if (!strcmp(token, "clone_children")) {
+ opts->cpuset_clone_children = true;
+ continue;
+ }
+ if (!strcmp(token, "xattr")) {
+ opts->flags |= CGRP_ROOT_XATTR;
+ continue;
+ }
+ if (!strncmp(token, "release_agent=", 14)) {
+ /* Specifying two release agents is forbidden */
+ if (opts->release_agent)
+ return -EINVAL;
+ opts->release_agent =
+ kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
+ if (!opts->release_agent)
+ return -ENOMEM;
+ continue;
+ }
+ if (!strncmp(token, "name=", 5)) {
+ const char *name = token + 5;
+ /* Can't specify an empty name */
+ if (!strlen(name))
+ return -EINVAL;
+ /* Must match [\w.-]+ */
+ for (i = 0; i < strlen(name); i++) {
+ char c = name[i];
+ if (isalnum(c))
+ continue;
+ if ((c == '.') || (c == '-') || (c == '_'))
+ continue;
+ return -EINVAL;
+ }
+ /* Specifying two names is forbidden */
+ if (opts->name)
+ return -EINVAL;
+ opts->name = kstrndup(name,
+ MAX_CGROUP_ROOT_NAMELEN - 1,
+ GFP_KERNEL);
+ if (!opts->name)
+ return -ENOMEM;
+
+ continue;
+ }
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->legacy_name))
+ continue;
+ if (!cgroup_ssid_enabled(i))
+ continue;
+ if (cgroup1_ssid_disabled(i))
+ continue;
+
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (all_ss)
+ return -EINVAL;
+ opts->subsys_mask |= (1 << i);
+ one_ss = true;
+
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT)
+ return -ENOENT;
+ }
+
+ /*
+ * If the 'all' option was specified select all the subsystems,
+ * otherwise if 'none', 'name=' and a subsystem name options were
+ * not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+ opts->subsys_mask |= (1 << i);
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+
+ /*
+ * Option noprefix was introduced just for backward compatibility
+ * with the old cpuset, so we allow noprefix only if mounting just
+ * the cpuset subsystem.
+ */
+ if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
+ return -EINVAL;
+
+ /* Can't specify "none" and some subsystems */
+ if (opts->subsys_mask && opts->none)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
+{
+ int ret = 0;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+ struct cgroup_sb_opts opts;
+ u16 added_mask, removed_mask;
+
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+ /* See what subsystems are wanted */
+ ret = parse_cgroupfs_options(data, &opts);
+ if (ret)
+ goto out_unlock;
+
+ if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
+
+ added_mask = opts.subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
+ /* Don't allow flags or name to change at remount */
+ if ((opts.flags ^ root->flags) ||
+ (opts.name && strcmp(opts.name, root->name))) {
+ pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+ opts.flags, opts.name ?: "", root->flags, root->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* remounting is not allowed for populated hierarchies */
+ if (!list_empty(&root->cgrp.self.children)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ ret = rebind_subsystems(root, added_mask);
+ if (ret)
+ goto out_unlock;
+
+ WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
+
+ if (opts.release_agent) {
+ spin_lock(&release_agent_path_lock);
+ strcpy(root->release_agent_path, opts.release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
+
+ trace_cgroup_remount(root);
+
+ out_unlock:
+ kfree(opts.release_agent);
+ kfree(opts.name);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
+ .rename = cgroup1_rename,
+ .show_options = cgroup1_show_options,
+ .remount_fs = cgroup1_remount,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .show_path = cgroup_show_path,
+};
+
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+ void *data, unsigned long magic,
+ struct cgroup_namespace *ns)
+{
+ struct super_block *pinned_sb = NULL;
+ struct cgroup_sb_opts opts;
+ struct cgroup_root *root;
+ struct cgroup_subsys *ss;
+ struct dentry *dentry;
+ int i, ret;
+
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+ /* First find the desired set of subsystems */
+ ret = parse_cgroupfs_options(data, &opts);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
+ */
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
+
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ cgroup_put(&ss->root->cgrp);
+ }
+
+ for_each_root(root) {
+ bool name_match = false;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ /*
+ * If we asked for a name then it must match. Also, if
+ * name matches but sybsys_mask doesn't, we should fail.
+ * Remember whether name matched.
+ */
+ if (opts.name) {
+ if (strcmp(opts.name, root->name))
+ continue;
+ name_match = true;
+ }
+
+ /*
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match.
+ */
+ if ((opts.subsys_mask || opts.none) &&
+ (opts.subsys_mask != root->subsys_mask)) {
+ if (!name_match)
+ continue;
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ if (root->flags ^ opts.flags)
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+
+ /*
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, tryget_live can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
+ */
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * No such thing, create a new one. name= matching without subsys
+ * specification is allowed for already existing hierarchies but we
+ * can't create new one without subsys specification.
+ */
+ if (!opts.subsys_mask && !opts.none) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Hierarchies may only be created in the initial cgroup namespace. */
+ if (ns != &init_cgroup_ns) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ init_cgroup_root(root, &opts);
+
+ ret = cgroup_setup_root(root, opts.subsys_mask);
+ if (ret)
+ cgroup_free_root(root);
+
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(opts.release_agent);
+ kfree(opts.name);
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
+ CGROUP_SUPER_MAGIC, ns);
+
+ /*
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
+ */
+ if (pinned_sb)
+ deactivate_super(pinned_sb);
+
+ return dentry;
+}
+
+static int __init cgroup1_wq_init(void)
+{
+ /*
+ * Used to destroy pidlists and separate to serve as flush domain.
+ * Cap @max_active to 1 too.
+ */
+ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+ 0, 1);
+ BUG_ON(!cgroup_pidlist_destroy_wq);
+ return 0;
+}
+core_initcall(cgroup1_wq_init);
+
+static int __init cgroup_no_v1(char *str)
+{
+ struct cgroup_subsys *ss;
+ char *token;
+ int i;
+
+ while ((token = strsep(&str, ",")) != NULL) {
+ if (!*token)
+ continue;
+
+ if (!strcmp(token, "all")) {
+ cgroup_no_v1_mask = U16_MAX;
+ break;
+ }
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->name) &&
+ strcmp(token, ss->legacy_name))
+ continue;
+
+ cgroup_no_v1_mask |= 1 << i;
+ }
+ }
+ return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
+
+
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return cgroup_task_count(css->cgroup);
+}
+
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = atomic_read(&task_css_set(current)->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ spin_lock_irq(&css_set_lock);
+ rcu_read_lock();
+ cset = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ cgroup_name(c, name_buf, NAME_MAX + 1);
+ seq_printf(seq, "Root %d group %s\n",
+ c->root->hierarchy_id, name_buf);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&css_set_lock);
+ kfree(name_buf);
+ return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
+
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
+ struct task_struct *task;
+ int count = 0;
+
+ seq_printf(seq, "css_set %p\n", cset);
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+ continue;
+ overflow:
+ seq_puts(seq, " ...\n");
+ }
+ spin_unlock_irq(&css_set_lock);
+ return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return (!cgroup_is_populated(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .read_u64 = current_css_set_read,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ },
+
+ {
+ .name = "cgroup_css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+
+ { } /* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .legacy_cftypes = debug_files,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup.c b/kernel/cgroup/cgroup.c
index 53bbca7c4859..e8f87bf9840c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -28,15 +28,13 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cgroup.h>
+#include "cgroup-internal.h"
+
#include <linux/cred.h>
-#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
-#include <linux/list.h>
#include <linux/magic.h>
-#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
@@ -47,16 +45,9 @@
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
-#include <linux/sort.h>
-#include <linux/kmod.h>
-#include <linux/delayacct.h>
-#include <linux/cgroupstats.h>
#include <linux/hashtable.h>
-#include <linux/pid_namespace.h>
#include <linux/idr.h>
-#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
-#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
@@ -67,14 +58,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>
-/*
- * pidlists linger the following amount before being destroyed. The goal
- * is avoiding frequent destruction in the middle of consecutive read calls
- * Expiring in the middle is a performance problem not a correctness one.
- * 1 sec should be enough.
- */
-#define CGROUP_PIDLIST_DESTROY_DELAY HZ
-
#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
MAX_CFTYPE_NAME + 2)
@@ -88,14 +71,12 @@
* These locks are exported if CONFIG_PROVE_RCU so that accessors in
* cgroup.h can use them for lockdep annotations.
*/
-#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
+
+#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
-#else
-static DEFINE_MUTEX(cgroup_mutex);
-static DEFINE_SPINLOCK(css_set_lock);
#endif
/*
@@ -110,12 +91,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-/*
- * Protects cgroup_subsys->release_agent_path. Modifying it also requires
- * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
- */
-static DEFINE_SPINLOCK(release_agent_path_lock);
-
struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
#define cgroup_assert_mutex_or_rcu_locked() \
@@ -131,15 +106,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
*/
static struct workqueue_struct *cgroup_destroy_wq;
-/*
- * pidlist destructions need to be flushed on cgroup destruction. Use a
- * separate workqueue as flush domain.
- */
-static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
-static struct cgroup_subsys *cgroup_subsys[] = {
+struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
@@ -186,18 +155,14 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
*/
static bool cgrp_dfl_visible;
-/* Controllers blocked by the commandline in v1 */
-static u16 cgroup_no_v1_mask;
-
/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
-static unsigned long cgrp_dfl_implicit_ss_mask;
+static u16 cgrp_dfl_implicit_ss_mask;
/* The list of hierarchy roots */
-
-static LIST_HEAD(cgroup_roots);
+LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
@@ -213,13 +178,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
static u64 css_serial_nr_next = 1;
/*
- * These bitmask flags indicate whether tasks in the fork and exit paths have
- * fork/exit handlers to call. This avoids us having to do extra work in the
- * fork/exit path to check which subsystems have fork/exit callbacks.
+ * These bitmasks identify subsystems with specific features to avoid
+ * having to do iterative checks repeatedly.
*/
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_free_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
@@ -230,15 +195,9 @@ struct cgroup_namespace init_cgroup_ns = {
.root_cset = &init_css_set,
};
-/* Ditto for the can_fork callback. */
-static u16 have_canfork_callback __read_mostly;
-
static struct file_system_type cgroup2_fs_type;
-static struct cftype cgroup_dfl_base_files[];
-static struct cftype cgroup_legacy_base_files[];
+static struct cftype cgroup_base_files[];
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
@@ -259,7 +218,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
* is fine for individual subsystems but unsuitable for cgroup core. This
* is slower static_key_enabled() based test indexed by @ssid.
*/
-static bool cgroup_ssid_enabled(int ssid)
+bool cgroup_ssid_enabled(int ssid)
{
if (CGROUP_SUBSYS_COUNT == 0)
return false;
@@ -267,11 +226,6 @@ static bool cgroup_ssid_enabled(int ssid)
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
-static bool cgroup_ssid_no_v1(int ssid)
-{
- return cgroup_no_v1_mask & (1 << ssid);
-}
-
/**
* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
* @cgrp: the cgroup of interest
@@ -325,7 +279,7 @@ static bool cgroup_ssid_no_v1(int ssid)
*
* - debug: disallowed on the default hierarchy.
*/
-static bool cgroup_on_dfl(const struct cgroup *cgrp)
+bool cgroup_on_dfl(const struct cgroup *cgrp)
{
return cgrp->root == &cgrp_dfl_root;
}
@@ -481,12 +435,6 @@ out_unlock:
return css;
}
-/* convenient tests for these bits */
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
- return !(cgrp->self.flags & CSS_ONLINE);
-}
-
static void cgroup_get(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
@@ -518,11 +466,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
}
EXPORT_SYMBOL_GPL(of_css);
-static int notify_on_release(const struct cgroup *cgrp)
-{
- return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-}
-
/**
* for_each_css - iterate all css's of a cgroup
* @css: the iteration cursor
@@ -553,15 +496,6 @@ static int notify_on_release(const struct cgroup *cgrp)
else
/**
- * for_each_subsys - iterate all enabled cgroup subsystems
- * @ss: the iteration cursor
- * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- */
-#define for_each_subsys(ss, ssid) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
- (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
-
-/**
* do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -585,10 +519,6 @@ static int notify_on_release(const struct cgroup *cgrp)
} \
} while (false)
-/* iterate across the hierarchies */
-#define for_each_root(root) \
- list_for_each_entry((root), &cgroup_roots, root_list)
-
/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp) \
list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
@@ -615,29 +545,6 @@ static int notify_on_release(const struct cgroup *cgrp)
; \
else
-static void cgroup_release_agent(struct work_struct *work);
-static void check_for_release(struct cgroup *cgrp);
-
-/*
- * A cgroup can be associated with multiple css_sets as different tasks may
- * belong to different cgroups on different hierarchies. In the other
- * direction, a css_set is naturally associated with multiple cgroups.
- * This M:N relationship is represented by the following link structure
- * which exists for each association and allows traversing the associations
- * from both sides.
- */
-struct cgrp_cset_link {
- /* the cgroup and css_set this link associates */
- struct cgroup *cgrp;
- struct css_set *cset;
-
- /* list of cgrp_cset_links anchored at cgrp->cset_links */
- struct list_head cset_link;
-
- /* list of cgrp_cset_links anchored at css_set->cgrp_links */
- struct list_head cgrp_link;
-};
-
/*
* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
@@ -647,12 +554,12 @@ struct cgrp_cset_link {
*/
struct css_set init_css_set = {
.refcount = ATOMIC_INIT(1),
- .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
- .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
};
static int css_set_count = 1; /* 1 for init_css_set */
@@ -699,7 +606,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
if (!trigger)
break;
- check_for_release(cgrp);
+ cgroup1_check_for_release(cgrp);
cgroup_file_notify(&cgrp->events_file);
cgrp = cgroup_parent(cgrp);
@@ -808,7 +715,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
return key;
}
-static void put_css_set_locked(struct css_set *cset)
+void put_css_set_locked(struct css_set *cset)
{
struct cgrp_cset_link *link, *tmp_link;
struct cgroup_subsys *ss;
@@ -838,31 +745,6 @@ static void put_css_set_locked(struct css_set *cset)
kfree_rcu(cset, rcu_head);
}
-static void put_css_set(struct css_set *cset)
-{
- unsigned long flags;
-
- /*
- * Ensure that the refcount doesn't hit zero while any readers
- * can see it. Similar to atomic_dec_and_lock(), but for an
- * rwlock
- */
- if (atomic_add_unless(&cset->refcount, -1, 1))
- return;
-
- spin_lock_irqsave(&css_set_lock, flags);
- put_css_set_locked(cset);
- spin_unlock_irqrestore(&css_set_lock, flags);
-}
-
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cset)
-{
- atomic_inc(&cset->refcount);
-}
-
/**
* compare_css_sets - helper function for find_existing_css_set().
* @cset: candidate css_set being tested
@@ -1095,13 +977,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
}
atomic_set(&cset->refcount, 1);
- INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
- INIT_LIST_HEAD(&cset->mg_preload_node);
- INIT_LIST_HEAD(&cset->mg_node);
INIT_LIST_HEAD(&cset->task_iters);
INIT_HLIST_NODE(&cset->hlist);
+ INIT_LIST_HEAD(&cset->cgrp_links);
+ INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
@@ -1138,7 +1020,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
return cset;
}
-static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -1166,7 +1048,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}
-static void cgroup_free_root(struct cgroup_root *root)
+void cgroup_free_root(struct cgroup_root *root)
{
if (root) {
idr_destroy(&root->cgroup_idr);
@@ -1283,8 +1165,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex and css_set_lock held.
*/
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
- struct cgroup_root *root)
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root)
{
/*
* No need to lock the task - since we hold cgroup_mutex the
@@ -1321,7 +1203,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
*/
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
-static const struct file_operations proc_cgroupstats_operations;
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
@@ -1415,7 +1296,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
* inaccessible any time. If the caller intends to continue to access the
* cgroup, it should pin it before invoking this function.
*/
-static void cgroup_kn_unlock(struct kernfs_node *kn)
+void cgroup_kn_unlock(struct kernfs_node *kn)
{
struct cgroup *cgrp;
@@ -1447,8 +1328,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
* locking under kernfs active protection and allows all kernfs operations
* including self-removal.
*/
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
- bool drain_offline)
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
struct cgroup *cgrp;
@@ -1532,9 +1412,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
if (!css->ss) {
if (cgroup_on_dfl(cgrp))
- cfts = cgroup_dfl_base_files;
+ cfts = cgroup_base_files;
else
- cfts = cgroup_legacy_base_files;
+ cfts = cgroup1_base_files;
return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
}
@@ -1559,7 +1439,7 @@ err:
return ret;
}
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
@@ -1629,8 +1509,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
return 0;
}
-static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
- struct kernfs_root *kf_root)
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+ struct kernfs_root *kf_root)
{
int len = 0;
char *buf = NULL;
@@ -1656,237 +1536,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
return len;
}
-static int cgroup_show_options(struct seq_file *seq,
- struct kernfs_root *kf_root)
-{
- struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- struct cgroup_subsys *ss;
- int ssid;
-
- if (root != &cgrp_dfl_root)
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_show_option(seq, ss->legacy_name, NULL);
- if (root->flags & CGRP_ROOT_NOPREFIX)
- seq_puts(seq, ",noprefix");
- if (root->flags & CGRP_ROOT_XATTR)
- seq_puts(seq, ",xattr");
-
- spin_lock(&release_agent_path_lock);
- if (strlen(root->release_agent_path))
- seq_show_option(seq, "release_agent",
- root->release_agent_path);
- spin_unlock(&release_agent_path_lock);
-
- if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
- seq_puts(seq, ",clone_children");
- if (strlen(root->name))
- seq_show_option(seq, "name", root->name);
- return 0;
-}
-
-struct cgroup_sb_opts {
- u16 subsys_mask;
- unsigned int flags;
- char *release_agent;
- bool cpuset_clone_children;
- char *name;
- /* User explicitly requested empty subsystem */
- bool none;
-};
-
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
-{
- char *token, *o = data;
- bool all_ss = false, one_ss = false;
- u16 mask = U16_MAX;
- struct cgroup_subsys *ss;
- int nr_opts = 0;
- int i;
-
-#ifdef CONFIG_CPUSETS
- mask = ~((u16)1 << cpuset_cgrp_id);
-#endif
-
- memset(opts, 0, sizeof(*opts));
-
- while ((token = strsep(&o, ",")) != NULL) {
- nr_opts++;
-
- if (!*token)
- return -EINVAL;
- if (!strcmp(token, "none")) {
- /* Explicitly have no subsystems */
- opts->none = true;
- continue;
- }
- if (!strcmp(token, "all")) {
- /* Mutually exclusive option 'all' + subsystem name */
- if (one_ss)
- return -EINVAL;
- all_ss = true;
- continue;
- }
- if (!strcmp(token, "noprefix")) {
- opts->flags |= CGRP_ROOT_NOPREFIX;
- continue;
- }
- if (!strcmp(token, "clone_children")) {
- opts->cpuset_clone_children = true;
- continue;
- }
- if (!strcmp(token, "xattr")) {
- opts->flags |= CGRP_ROOT_XATTR;
- continue;
- }
- if (!strncmp(token, "release_agent=", 14)) {
- /* Specifying two release agents is forbidden */
- if (opts->release_agent)
- return -EINVAL;
- opts->release_agent =
- kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
- if (!opts->release_agent)
- return -ENOMEM;
- continue;
- }
- if (!strncmp(token, "name=", 5)) {
- const char *name = token + 5;
- /* Can't specify an empty name */
- if (!strlen(name))
- return -EINVAL;
- /* Must match [\w.-]+ */
- for (i = 0; i < strlen(name); i++) {
- char c = name[i];
- if (isalnum(c))
- continue;
- if ((c == '.') || (c == '-') || (c == '_'))
- continue;
- return -EINVAL;
- }
- /* Specifying two names is forbidden */
- if (opts->name)
- return -EINVAL;
- opts->name = kstrndup(name,
- MAX_CGROUP_ROOT_NAMELEN - 1,
- GFP_KERNEL);
- if (!opts->name)
- return -ENOMEM;
-
- continue;
- }
-
- for_each_subsys(ss, i) {
- if (strcmp(token, ss->legacy_name))
- continue;
- if (!cgroup_ssid_enabled(i))
- continue;
- if (cgroup_ssid_no_v1(i))
- continue;
-
- /* Mutually exclusive option 'all' + subsystem name */
- if (all_ss)
- return -EINVAL;
- opts->subsys_mask |= (1 << i);
- one_ss = true;
-
- break;
- }
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
- }
-
- /*
- * If the 'all' option was specified select all the subsystems,
- * otherwise if 'none', 'name=' and a subsystem name options were
- * not specified, let's default to 'all'
- */
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
- opts->subsys_mask |= (1 << i);
-
- /*
- * We either have to specify by name or by subsystems. (So all
- * empty hierarchies must have a name).
- */
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
-
- /*
- * Option noprefix was introduced just for backward compatibility
- * with the old cpuset, so we allow noprefix only if mounting just
- * the cpuset subsystem.
- */
- if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
- return -EINVAL;
-
- /* Can't specify "none" and some subsystems */
- if (opts->subsys_mask && opts->none)
- return -EINVAL;
-
- return 0;
-}
-
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
- int ret = 0;
- struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- struct cgroup_sb_opts opts;
- u16 added_mask, removed_mask;
-
- if (root == &cgrp_dfl_root) {
- pr_err("remount is not allowed\n");
- return -EINVAL;
- }
-
- cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
- /* See what subsystems are wanted */
- ret = parse_cgroupfs_options(data, &opts);
- if (ret)
- goto out_unlock;
-
- if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
- pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
- task_tgid_nr(current), current->comm);
-
- added_mask = opts.subsys_mask & ~root->subsys_mask;
- removed_mask = root->subsys_mask & ~opts.subsys_mask;
-
- /* Don't allow flags or name to change at remount */
- if ((opts.flags ^ root->flags) ||
- (opts.name && strcmp(opts.name, root->name))) {
- pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags, opts.name ?: "", root->flags, root->name);
- ret = -EINVAL;
- goto out_unlock;
- }
-
- /* remounting is not allowed for populated hierarchies */
- if (!list_empty(&root->cgrp.self.children)) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- ret = rebind_subsystems(root, added_mask);
- if (ret)
- goto out_unlock;
-
- WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
-
- if (opts.release_agent) {
- spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, opts.release_agent);
- spin_unlock(&release_agent_path_lock);
- }
-
- trace_cgroup_remount(root);
-
- out_unlock:
- kfree(opts.release_agent);
- kfree(opts.name);
- mutex_unlock(&cgroup_mutex);
- return ret;
+ pr_err("remount is not allowed\n");
+ return -EINVAL;
}
/*
@@ -1964,11 +1617,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
init_waitqueue_head(&cgrp->offline_waitq);
- INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
+ INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
-static void init_cgroup_root(struct cgroup_root *root,
- struct cgroup_sb_opts *opts)
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
{
struct cgroup *cgrp = &root->cgrp;
@@ -1987,10 +1639,11 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
+ struct kernfs_syscall_ops *kf_sops;
struct css_set *cset;
int i, ret;
@@ -2022,7 +1675,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto cancel_ref;
- root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+ kf_sops = root == &cgrp_dfl_root ?
+ &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
+
+ root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED,
root_cgrp);
if (IS_ERR(root->kf_root)) {
@@ -2080,182 +1736,18 @@ out:
return ret;
}
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data)
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+ struct cgroup_root *root, unsigned long magic,
+ struct cgroup_namespace *ns)
{
- bool is_v2 = fs_type == &cgroup2_fs_type;
- struct super_block *pinned_sb = NULL;
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct cgroup_subsys *ss;
- struct cgroup_root *root;
- struct cgroup_sb_opts opts;
struct dentry *dentry;
- int ret;
- int i;
bool new_sb;
- get_cgroup_ns(ns);
-
- /* Check if the caller has permission to mount. */
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
- put_cgroup_ns(ns);
- return ERR_PTR(-EPERM);
- }
-
- /*
- * The first time anyone tries to mount a cgroup, enable the list
- * linking each css_set to its tasks and fix up all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
-
- if (is_v2) {
- if (data) {
- pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
- put_cgroup_ns(ns);
- return ERR_PTR(-EINVAL);
- }
- cgrp_dfl_visible = true;
- root = &cgrp_dfl_root;
- cgroup_get(&root->cgrp);
- goto out_mount;
- }
-
- cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
- /* First find the desired set of subsystems */
- ret = parse_cgroupfs_options(data, &opts);
- if (ret)
- goto out_unlock;
-
- /*
- * Destruction of cgroup root is asynchronous, so subsystems may
- * still be dying after the previous unmount. Let's drain the
- * dying subsystems. We just need to ensure that the ones
- * unmounted previously finish dying and don't care about new ones
- * starting. Testing ref liveliness is good enough.
- */
- for_each_subsys(ss, i) {
- if (!(opts.subsys_mask & (1 << i)) ||
- ss->root == &cgrp_dfl_root)
- continue;
-
- if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
- cgroup_put(&ss->root->cgrp);
- }
-
- for_each_root(root) {
- bool name_match = false;
-
- if (root == &cgrp_dfl_root)
- continue;
-
- /*
- * If we asked for a name then it must match. Also, if
- * name matches but sybsys_mask doesn't, we should fail.
- * Remember whether name matched.
- */
- if (opts.name) {
- if (strcmp(opts.name, root->name))
- continue;
- name_match = true;
- }
-
- /*
- * If we asked for subsystems (or explicitly for no
- * subsystems) then they must match.
- */
- if ((opts.subsys_mask || opts.none) &&
- (opts.subsys_mask != root->subsys_mask)) {
- if (!name_match)
- continue;
- ret = -EBUSY;
- goto out_unlock;
- }
-
- if (root->flags ^ opts.flags)
- pr_warn("new mount options do not match the existing superblock, will be ignored\n");
-
- /*
- * We want to reuse @root whose lifetime is governed by its
- * ->cgrp. Let's check whether @root is alive and keep it
- * that way. As cgroup_kill_sb() can happen anytime, we
- * want to block it by pinning the sb so that @root doesn't
- * get killed before mount is complete.
- *
- * With the sb pinned, tryget_live can reliably indicate
- * whether @root can be reused. If it's being killed,
- * drain it. We can use wait_queue for the wait but this
- * path is super cold. Let's just sleep a bit and retry.
- */
- pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
- if (IS_ERR(pinned_sb) ||
- !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- if (!IS_ERR_OR_NULL(pinned_sb))
- deactivate_super(pinned_sb);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
-
- ret = 0;
- goto out_unlock;
- }
+ dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
/*
- * No such thing, create a new one. name= matching without subsys
- * specification is allowed for already existing hierarchies but we
- * can't create new one without subsys specification.
- */
- if (!opts.subsys_mask && !opts.none) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- /* Hierarchies may only be created in the initial cgroup namespace. */
- if (ns != &init_cgroup_ns) {
- ret = -EPERM;
- goto out_unlock;
- }
-
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- init_cgroup_root(root, &opts);
-
- ret = cgroup_setup_root(root, opts.subsys_mask);
- if (ret)
- cgroup_free_root(root);
-
-out_unlock:
- mutex_unlock(&cgroup_mutex);
-out_free:
- kfree(opts.release_agent);
- kfree(opts.name);
-
- if (ret) {
- put_cgroup_ns(ns);
- return ERR_PTR(ret);
- }
-out_mount:
- dentry = kernfs_mount(fs_type, flags, root->kf_root,
- is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
- &new_sb);
-
- /*
- * In non-init cgroup namespace, instead of root cgroup's
- * dentry, we return the dentry corresponding to the
- * cgroupns->root_cgrp.
+ * In non-init cgroup namespace, instead of root cgroup's dentry,
+ * we return the dentry corresponding to the cgroupns->root_cgrp.
*/
if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
struct dentry *nsdentry;
@@ -2277,13 +1769,45 @@ out_mount:
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
+ return dentry;
+}
+
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data)
+{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct dentry *dentry;
+
+ get_cgroup_ns(ns);
+
+ /* Check if the caller has permission to mount. */
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+ put_cgroup_ns(ns);
+ return ERR_PTR(-EPERM);
+ }
+
/*
- * If @pinned_sb, we're reusing an existing root and holding an
- * extra ref on its sb. Mount is complete. Put the extra ref.
+ * The first time anyone tries to mount a cgroup, enable the list
+ * linking each css_set to its tasks and fix up all existing tasks.
*/
- if (pinned_sb) {
- WARN_ON(new_sb);
- deactivate_super(pinned_sb);
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
+
+ if (fs_type == &cgroup2_fs_type) {
+ if (data) {
+ pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ put_cgroup_ns(ns);
+ return ERR_PTR(-EINVAL);
+ }
+ cgrp_dfl_visible = true;
+ cgroup_get(&cgrp_dfl_root.cgrp);
+
+ dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
+ CGROUP2_SUPER_MAGIC, ns);
+ } else {
+ dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
+ CGROUP_SUPER_MAGIC, ns);
}
put_cgroup_ns(ns);
@@ -2311,7 +1835,7 @@ static void cgroup_kill_sb(struct super_block *sb)
kernfs_kill_sb(sb);
}
-static struct file_system_type cgroup_fs_type = {
+struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
@@ -2325,8 +1849,8 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
- struct cgroup_namespace *ns)
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
{
struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
@@ -2389,49 +1913,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
-/* used to track tasks and other necessary states during migration */
-struct cgroup_taskset {
- /* the src and dst cset list running through cset->mg_node */
- struct list_head src_csets;
- struct list_head dst_csets;
-
- /* the subsys currently being processed */
- int ssid;
-
- /*
- * Fields for cgroup_taskset_*() iteration.
- *
- * Before migration is committed, the target migration tasks are on
- * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
- * the csets on ->dst_csets. ->csets point to either ->src_csets
- * or ->dst_csets depending on whether migration is committed.
- *
- * ->cur_csets and ->cur_task point to the current task position
- * during iteration.
- */
- struct list_head *csets;
- struct css_set *cur_cset;
- struct task_struct *cur_task;
-};
-
-#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \
- .src_csets = LIST_HEAD_INIT(tset.src_csets), \
- .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
- .csets = &tset.src_csets, \
-}
-
/**
- * cgroup_taskset_add - try to add a migration target task to a taskset
+ * cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
- * @tset: target taskset
+ * @mgctx: target migration context
*
- * Add @task, which is a migration target, to @tset. This function becomes
- * noop if @task doesn't need to be migrated. @task's css_set should have
- * been added as a migration source and @task->cg_list will be moved from
- * the css_set's tasks list to mg_tasks one.
+ * Add @task, which is a migration target, to @mgctx->tset. This function
+ * becomes noop if @task doesn't need to be migrated. @task's css_set
+ * should have been added as a migration source and @task->cg_list will be
+ * moved from the css_set's tasks list to mg_tasks one.
*/
-static void cgroup_taskset_add(struct task_struct *task,
- struct cgroup_taskset *tset)
+static void cgroup_migrate_add_task(struct task_struct *task,
+ struct cgroup_mgctx *mgctx)
{
struct css_set *cset;
@@ -2451,10 +1944,11 @@ static void cgroup_taskset_add(struct task_struct *task,
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
- list_add_tail(&cset->mg_node, &tset->src_csets);
+ list_add_tail(&cset->mg_node,
+ &mgctx->tset.src_csets);
if (list_empty(&cset->mg_dst_cset->mg_node))
- list_move_tail(&cset->mg_dst_cset->mg_node,
- &tset->dst_csets);
+ list_add_tail(&cset->mg_dst_cset->mg_node,
+ &mgctx->tset.dst_csets);
}
/**
@@ -2521,17 +2015,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
/**
* cgroup_taskset_migrate - migrate a taskset
- * @tset: taget taskset
- * @root: cgroup root the migration is taking place on
+ * @mgctx: migration context
*
- * Migrate tasks in @tset as setup by migration preparation functions.
+ * Migrate tasks in @mgctx as setup by migration preparation functions.
* This function fails iff one of the ->can_attach callbacks fails and
- * guarantees that either all or none of the tasks in @tset are migrated.
- * @tset is consumed regardless of success.
+ * guarantees that either all or none of the tasks in @mgctx are migrated.
+ * @mgctx is consumed regardless of success.
*/
-static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
- struct cgroup_root *root)
+static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
+ struct cgroup_taskset *tset = &mgctx->tset;
struct cgroup_subsys *ss;
struct task_struct *task, *tmp_task;
struct css_set *cset, *tmp_cset;
@@ -2542,7 +2035,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
return 0;
/* check that we can legitimately attach to the cgroup */
- do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ss->can_attach) {
tset->ssid = ssid;
ret = ss->can_attach(tset);
@@ -2578,7 +2071,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
*/
tset->csets = &tset->dst_csets;
- do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ss->attach) {
tset->ssid = ssid;
ss->attach(tset);
@@ -2589,7 +2082,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
goto out_release_tset;
out_cancel_attach:
- do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ssid == failed_ssid)
break;
if (ss->cancel_attach) {
@@ -2616,7 +2109,7 @@ out_release_tset:
* zero for migration destination cgroups with tasks so that child cgroups
* don't compete against tasks.
*/
-static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
{
return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
!dst_cgrp->subtree_control;
@@ -2624,25 +2117,31 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
/**
* cgroup_migrate_finish - cleanup after attach
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
*
* Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
* those functions for details.
*/
-static void cgroup_migrate_finish(struct list_head *preloaded_csets)
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
+ LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+
+ list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
+ list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+
+ list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
}
+
spin_unlock_irq(&css_set_lock);
}
@@ -2650,10 +2149,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
* cgroup_migrate_add_src - add a migration source css_set
* @src_cset: the source css_set to add
* @dst_cgrp: the destination cgroup
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
*
* Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
- * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
* up by cgroup_migrate_finish().
*
* This function may be called without holding cgroup_threadgroup_rwsem
@@ -2662,9 +2161,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
* into play and the preloaded css_sets are guaranteed to cover all
* migrations.
*/
-static void cgroup_migrate_add_src(struct css_set *src_cset,
- struct cgroup *dst_cgrp,
- struct list_head *preloaded_csets)
+void cgroup_migrate_add_src(struct css_set *src_cset,
+ struct cgroup *dst_cgrp,
+ struct cgroup_mgctx *mgctx)
{
struct cgroup *src_cgrp;
@@ -2692,33 +2191,35 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
- list_add(&src_cset->mg_preload_node, preloaded_csets);
+ list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
}
/**
* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @preloaded_csets: list of preloaded source css_sets
+ * @mgctx: migration context
*
* Tasks are about to be moved and all the source css_sets have been
- * preloaded to @preloaded_csets. This function looks up and pins all
- * destination css_sets, links each to its source, and append them to
- * @preloaded_csets.
+ * preloaded to @mgctx->preloaded_src_csets. This function looks up and
+ * pins all destination css_sets, links each to its source, and append them
+ * to @mgctx->preloaded_dst_csets.
*
* This function must be called after cgroup_migrate_add_src() has been
* called on each migration source css_set. After migration is performed
* using cgroup_migrate(), cgroup_migrate_finish() must be called on
- * @preloaded_csets.
+ * @mgctx.
*/
-static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
- LIST_HEAD(csets);
struct css_set *src_cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
/* look up the dst cset for each src cset and link it to src */
- list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_preload_node) {
struct css_set *dst_cset;
+ struct cgroup_subsys *ss;
+ int ssid;
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
if (!dst_cset)
@@ -2743,15 +2244,19 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
src_cset->mg_dst_cset = dst_cset;
if (list_empty(&dst_cset->mg_preload_node))
- list_add(&dst_cset->mg_preload_node, &csets);
+ list_add_tail(&dst_cset->mg_preload_node,
+ &mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
+
+ for_each_subsys(ss, ssid)
+ if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
+ mgctx->ss_mask |= 1 << ssid;
}
- list_splice_tail(&csets, preloaded_csets);
return 0;
err:
- cgroup_migrate_finish(&csets);
+ cgroup_migrate_finish(mgctx);
return -ENOMEM;
}
@@ -2759,7 +2264,7 @@ err:
* cgroup_migrate - migrate a process or task to a cgroup
* @leader: the leader of the process or the task to migrate
* @threadgroup: whether @leader points to the whole process or a single task
- * @root: cgroup root migration is taking place on
+ * @mgctx: migration context
*
* Migrate a process or task denoted by @leader. If migrating a process,
* the caller must be holding cgroup_threadgroup_rwsem. The caller is also
@@ -2773,10 +2278,9 @@ err:
* decided for all targets by invoking group_migrate_prepare_dst() before
* actually starting migrating.
*/
-static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
- struct cgroup_root *root)
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+ struct cgroup_mgctx *mgctx)
{
- struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
struct task_struct *task;
/*
@@ -2788,14 +2292,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
rcu_read_lock();
task = leader;
do {
- cgroup_taskset_add(task, &tset);
+ cgroup_migrate_add_task(task, mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
- return cgroup_taskset_migrate(&tset, root);
+ return cgroup_migrate_execute(mgctx);
}
/**
@@ -2806,10 +2310,10 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
*
* Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
*/
-static int cgroup_attach_task(struct cgroup *dst_cgrp,
- struct task_struct *leader, bool threadgroup)
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+ bool threadgroup)
{
- LIST_HEAD(preloaded_csets);
+ DEFINE_CGROUP_MGCTX(mgctx);
struct task_struct *task;
int ret;
@@ -2821,8 +2325,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
rcu_read_lock();
task = leader;
do {
- cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
- &preloaded_csets);
+ cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
@@ -2830,11 +2333,11 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
- ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&mgctx);
if (!ret)
- ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
+ ret = cgroup_migrate(leader, threadgroup, &mgctx);
- cgroup_migrate_finish(&preloaded_csets);
+ cgroup_migrate_finish(&mgctx);
if (!ret)
trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
@@ -2846,20 +2349,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
struct cgroup *dst_cgrp,
struct kernfs_open_file *of)
{
- const struct cred *cred = current_cred();
- const struct cred *tcred = get_task_cred(task);
int ret = 0;
- /*
- * even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
- */
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
- ret = -EACCES;
-
- if (!ret && cgroup_on_dfl(dst_cgrp)) {
+ if (cgroup_on_dfl(dst_cgrp)) {
struct super_block *sb = of->file->f_path.dentry->d_sb;
struct cgroup *cgrp;
struct inode *inode;
@@ -2877,9 +2369,21 @@ static int cgroup_procs_write_permission(struct task_struct *task,
ret = inode_permission(inode, MAY_WRITE);
iput(inode);
}
+ } else {
+ const struct cred *cred = current_cred();
+ const struct cred *tcred = get_task_cred(task);
+
+ /*
+ * even if we're attaching all tasks in the thread group,
+ * we only need to check permissions on one of them.
+ */
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EACCES;
+ put_cred(tcred);
}
- put_cred(tcred);
return ret;
}
@@ -2888,8 +2392,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
* function to attach either it or all tasks in its threadgroup. Will lock
* cgroup_mutex and threadgroup.
*/
-static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup)
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
struct cgroup_subsys *ss;
@@ -2950,86 +2454,12 @@ out_unlock_threadgroup:
return ret ?: nbytes;
}
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
- struct cgroup_root *root;
- int retval = 0;
-
- mutex_lock(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
- for_each_root(root) {
- struct cgroup *from_cgrp;
-
- if (root == &cgrp_dfl_root)
- continue;
-
- spin_lock_irq(&css_set_lock);
- from_cgrp = task_cgroup_from_root(from, root);
- spin_unlock_irq(&css_set_lock);
-
- retval = cgroup_attach_task(from_cgrp, tsk, false);
- if (retval)
- break;
- }
- percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- return __cgroup_procs_write(of, buf, nbytes, off, false);
-}
-
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off)
{
return __cgroup_procs_write(of, buf, nbytes, off, true);
}
-static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct cgroup *cgrp;
-
- BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-
- cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!cgrp)
- return -ENODEV;
- spin_lock(&release_agent_path_lock);
- strlcpy(cgrp->root->release_agent_path, strstrip(buf),
- sizeof(cgrp->root->release_agent_path));
- spin_unlock(&release_agent_path_lock);
- cgroup_kn_unlock(of->kn);
- return nbytes;
-}
-
-static int cgroup_release_agent_show(struct seq_file *seq, void *v)
-{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- spin_lock(&release_agent_path_lock);
- seq_puts(seq, cgrp->root->release_agent_path);
- spin_unlock(&release_agent_path_lock);
- seq_putc(seq, '\n');
- return 0;
-}
-
-static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
-{
- seq_puts(seq, "0\n");
- return 0;
-}
-
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
struct cgroup_subsys *ss;
@@ -3075,8 +2505,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
*/
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
- LIST_HEAD(preloaded_csets);
- struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
+ DEFINE_CGROUP_MGCTX(mgctx);
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
@@ -3092,33 +2521,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgrp_cset_link *link;
list_for_each_entry(link, &dsct->cset_links, cset_link)
- cgroup_migrate_add_src(link->cset, dsct,
- &preloaded_csets);
+ cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
/* NULL dst indicates self on default hierarchy */
- ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
- list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+ list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
struct task_struct *task, *ntask;
- /* src_csets precede dst_csets, break on the first dst_cset */
- if (!src_cset->mg_src_cgrp)
- break;
-
/* all tasks in src_csets need to be migrated */
list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
- cgroup_taskset_add(task, &tset);
+ cgroup_migrate_add_task(task, &mgctx);
}
spin_unlock_irq(&css_set_lock);
- ret = cgroup_taskset_migrate(&tset, cgrp->root);
+ ret = cgroup_migrate_execute(&mgctx);
out_finish:
- cgroup_migrate_finish(&preloaded_csets);
+ cgroup_migrate_finish(&mgctx);
percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
@@ -3131,7 +2555,7 @@ out_finish:
* controller while the previous css is still around. This function grabs
* cgroup_mutex and drains the previous css instances of @cgrp's subtree.
*/
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
__acquires(&cgroup_mutex)
{
struct cgroup *dsct;
@@ -3503,6 +2927,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_file_open(struct kernfs_open_file *of)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->open)
+ return cft->open(of);
+ return 0;
+}
+
+static void cgroup_file_release(struct kernfs_open_file *of)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->release)
+ cft->release(of);
+}
+
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
@@ -3553,7 +2994,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
- seq_cft(seq)->seq_stop(seq, v);
+ if (seq_cft(seq)->seq_stop)
+ seq_cft(seq)->seq_stop(seq, v);
}
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -3575,12 +3017,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
static struct kernfs_ops cgroup_kf_single_ops = {
.atomic_write_len = PAGE_SIZE,
+ .open = cgroup_file_open,
+ .release = cgroup_file_release,
.write = cgroup_file_write,
.seq_show = cgroup_seqfile_show,
};
static struct kernfs_ops cgroup_kf_ops = {
.atomic_write_len = PAGE_SIZE,
+ .open = cgroup_file_open,
+ .release = cgroup_file_release,
.write = cgroup_file_write,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
@@ -3588,48 +3034,6 @@ static struct kernfs_ops cgroup_kf_ops = {
.seq_show = cgroup_seqfile_show,
};
-/*
- * cgroup_rename - Only allow simple rename of directories in place.
- */
-static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
- const char *new_name_str)
-{
- struct cgroup *cgrp = kn->priv;
- int ret;
-
- if (kernfs_type(kn) != KERNFS_DIR)
- return -ENOTDIR;
- if (kn->parent != new_parent)
- return -EIO;
-
- /*
- * This isn't a proper migration and its usefulness is very
- * limited. Disallow on the default hierarchy.
- */
- if (cgroup_on_dfl(cgrp))
- return -EPERM;
-
- /*
- * We're gonna grab cgroup_mutex which nests outside kernfs
- * active_ref. kernfs_rename() doesn't require active_ref
- * protection. Break them before grabbing cgroup_mutex.
- */
- kernfs_break_active_protection(new_parent);
- kernfs_break_active_protection(kn);
-
- mutex_lock(&cgroup_mutex);
-
- ret = kernfs_rename(kn, new_parent, new_name_str);
- if (!ret)
- trace_cgroup_rename(cgrp);
-
- mutex_unlock(&cgroup_mutex);
-
- kernfs_unbreak_active_protection(kn);
- kernfs_unbreak_active_protection(new_parent);
- return ret;
-}
-
/* set uid and gid of cgroup dirs and files to that of the creator */
static int cgroup_kn_set_ugid(struct kernfs_node *kn)
{
@@ -3926,26 +3330,6 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
/**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup. The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
- */
-static int cgroup_task_count(const struct cgroup *cgrp)
-{
- int count = 0;
- struct cgrp_cset_link *link;
-
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += atomic_read(&link->cset->refcount);
- spin_unlock_irq(&css_set_lock);
- return count;
-}
-
-/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
@@ -4343,560 +3727,69 @@ void css_task_iter_end(struct css_task_iter *it)
put_task_struct(it->cur_task);
}
-/**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
- * @to: cgroup to which the tasks will be moved
- * @from: cgroup in which the tasks currently reside
- *
- * Locking rules between cgroup_post_fork() and the migration path
- * guarantee that, if a task is forking while being migrated, the new child
- * is guaranteed to be either visible in the source cgroup after the
- * parent's migration is complete or put into the target cgroup. No task
- * can slip out of migration through forking.
- */
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
-{
- LIST_HEAD(preloaded_csets);
- struct cgrp_cset_link *link;
- struct css_task_iter it;
- struct task_struct *task;
- int ret;
-
- if (!cgroup_may_migrate_to(to))
- return -EBUSY;
-
- mutex_lock(&cgroup_mutex);
-
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
- /* all tasks in @from are being moved, all csets are source */
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &from->cset_links, cset_link)
- cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
- spin_unlock_irq(&css_set_lock);
-
- ret = cgroup_migrate_prepare_dst(&preloaded_csets);
- if (ret)
- goto out_err;
-
- /*
- * Migrate tasks one-by-one until @from is empty. This fails iff
- * ->can_attach() fails.
- */
- do {
- css_task_iter_start(&from->self, &it);
- task = css_task_iter_next(&it);
- if (task)
- get_task_struct(task);
- css_task_iter_end(&it);
-
- if (task) {
- ret = cgroup_migrate(task, false, to->root);
- if (!ret)
- trace_cgroup_transfer_tasks(to, task, false);
- put_task_struct(task);
- }
- } while (task && !ret);
-out_err:
- cgroup_migrate_finish(&preloaded_csets);
- percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-/*
- * Stuff for reading the 'tasks'/'procs' files.
- *
- * Reading this file can return large amounts of data if a cgroup has
- * *lots* of attached tasks. So it may need several calls to read(),
- * but we cannot guarantee that the information we produce is correct
- * unless we produce it entirely atomically.
- *
- */
-
-/* which pidlist file are we talking about? */
-enum cgroup_filetype {
- CGROUP_FILE_PROCS,
- CGROUP_FILE_TASKS,
-};
-
-/*
- * A pidlist is a list of pids that virtually represents the contents of one
- * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
- * a pair (one each for procs, tasks) for each pid namespace that's relevant
- * to the cgroup.
- */
-struct cgroup_pidlist {
- /*
- * used to find which pidlist is wanted. doesn't change as long as
- * this particular list stays in the list.
- */
- struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
- /* array of xids */
- pid_t *list;
- /* how many elements the above list has */
- int length;
- /* each of these stored in a list by its cgroup */
- struct list_head links;
- /* pointer to the cgroup we belong to, for list removal purposes */
- struct cgroup *owner;
- /* for delayed destruction */
- struct delayed_work destroy_dwork;
-};
-
-/*
- * The following two functions "fix" the issue where there are more pids
- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
- * TODO: replace with a kernel-wide solution to this problem
- */
-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
-static void *pidlist_allocate(int count)
-{
- if (PIDLIST_TOO_LARGE(count))
- return vmalloc(count * sizeof(pid_t));
- else
- return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
-}
-
-static void pidlist_free(void *p)
-{
- kvfree(p);
-}
-
-/*
- * Used to destroy all pidlists lingering waiting for destroy timer. None
- * should be left afterwards.
- */
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
-{
- struct cgroup_pidlist *l, *tmp_l;
-
- mutex_lock(&cgrp->pidlist_mutex);
- list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
- mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
- mutex_unlock(&cgrp->pidlist_mutex);
-
- flush_workqueue(cgroup_pidlist_destroy_wq);
- BUG_ON(!list_empty(&cgrp->pidlists));
-}
-
-static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
-{
- struct delayed_work *dwork = to_delayed_work(work);
- struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
- destroy_dwork);
- struct cgroup_pidlist *tofree = NULL;
-
- mutex_lock(&l->owner->pidlist_mutex);
-
- /*
- * Destroy iff we didn't get queued again. The state won't change
- * as destroy_dwork can only be queued while locked.
- */
- if (!delayed_work_pending(dwork)) {
- list_del(&l->links);
- pidlist_free(l->list);
- put_pid_ns(l->key.ns);
- tofree = l;
- }
-
- mutex_unlock(&l->owner->pidlist_mutex);
- kfree(tofree);
-}
-
-/*
- * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * Returns the number of unique elements.
- */
-static int pidlist_uniq(pid_t *list, int length)
-{
- int src, dest = 1;
-
- /*
- * we presume the 0th element is unique, so i starts at 1. trivial
- * edge cases first; no work needs to be done for either
- */
- if (length == 0 || length == 1)
- return length;
- /* src and dest walk down the list; dest counts unique elements */
- for (src = 1; src < length; src++) {
- /* find next unique element */
- while (list[src] == list[src-1]) {
- src++;
- if (src == length)
- goto after;
- }
- /* dest always points to where the next unique element goes */
- list[dest] = list[src];
- dest++;
- }
-after:
- return dest;
-}
-
-/*
- * The two pid files - task and cgroup.procs - guaranteed that the result
- * is sorted, which forced this whole pidlist fiasco. As pid order is
- * different per namespace, each namespace needs differently sorted list,
- * making it impossible to use, for example, single rbtree of member tasks
- * sorted by task pointer. As pidlists can be fairly large, allocating one
- * per open file is dangerous, so cgroup had to implement shared pool of
- * pidlists keyed by cgroup and namespace.
- *
- * All this extra complexity was caused by the original implementation
- * committing to an entirely unnecessary property. In the long term, we
- * want to do away with it. Explicitly scramble sort order if on the
- * default hierarchy so that no such expectation exists in the new
- * interface.
- *
- * Scrambling is done by swapping every two consecutive bits, which is
- * non-identity one-to-one mapping which disturbs sort order sufficiently.
- */
-static pid_t pid_fry(pid_t pid)
+static void cgroup_procs_release(struct kernfs_open_file *of)
{
- unsigned a = pid & 0x55555555;
- unsigned b = pid & 0xAAAAAAAA;
-
- return (a << 1) | (b >> 1);
-}
-
-static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
-{
- if (cgroup_on_dfl(cgrp))
- return pid_fry(pid);
- else
- return pid;
-}
-
-static int cmppid(const void *a, const void *b)
-{
- return *(pid_t *)a - *(pid_t *)b;
-}
-
-static int fried_cmppid(const void *a, const void *b)
-{
- return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
-}
-
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
- enum cgroup_filetype type)
-{
- struct cgroup_pidlist *l;
- /* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = task_active_pid_ns(current);
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- list_for_each_entry(l, &cgrp->pidlists, links)
- if (l->key.type == type && l->key.ns == ns)
- return l;
- return NULL;
-}
-
-/*
- * find the appropriate pidlist for our purpose (given procs vs tasks)
- * returns with the lock on that pidlist already held, and takes care
- * of the use count, or returns NULL with no locks held if we're out of
- * memory.
- */
-static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
- enum cgroup_filetype type)
-{
- struct cgroup_pidlist *l;
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- l = cgroup_pidlist_find(cgrp, type);
- if (l)
- return l;
-
- /* entry not found; create a new one */
- l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
- if (!l)
- return l;
-
- INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
- l->key.type = type;
- /* don't need task_nsproxy() if we're looking at ourself */
- l->key.ns = get_pid_ns(task_active_pid_ns(current));
- l->owner = cgrp;
- list_add(&l->links, &cgrp->pidlists);
- return l;
-}
-
-/*
- * Load a cgroup's pidarray with either procs' tgids or tasks' pids
- */
-static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
- struct cgroup_pidlist **lp)
-{
- pid_t *array;
- int length;
- int pid, n = 0; /* used for populating the array */
- struct css_task_iter it;
- struct task_struct *tsk;
- struct cgroup_pidlist *l;
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- /*
- * If cgroup gets more users after we read count, we won't have
- * enough space - tough. This race is indistinguishable to the
- * caller from the case that the additional cgroup users didn't
- * show up until sometime later on.
- */
- length = cgroup_task_count(cgrp);
- array = pidlist_allocate(length);
- if (!array)
- return -ENOMEM;
- /* now, populate the array */
- css_task_iter_start(&cgrp->self, &it);
- while ((tsk = css_task_iter_next(&it))) {
- if (unlikely(n == length))
- break;
- /* get tgid or pid for procs or tasks file respectively */
- if (type == CGROUP_FILE_PROCS)
- pid = task_tgid_vnr(tsk);
- else
- pid = task_pid_vnr(tsk);
- if (pid > 0) /* make sure to only use valid results */
- array[n++] = pid;
- }
- css_task_iter_end(&it);
- length = n;
- /* now sort & (if procs) strip out duplicates */
- if (cgroup_on_dfl(cgrp))
- sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
- else
- sort(array, length, sizeof(pid_t), cmppid, NULL);
- if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(array, length);
-
- l = cgroup_pidlist_find_create(cgrp, type);
- if (!l) {
- pidlist_free(array);
- return -ENOMEM;
+ if (of->priv) {
+ css_task_iter_end(of->priv);
+ kfree(of->priv);
}
-
- /* store array, freeing old if necessary */
- pidlist_free(l->list);
- l->list = array;
- l->length = length;
- *lp = l;
- return 0;
}
-/**
- * cgroupstats_build - build and fill cgroupstats
- * @stats: cgroupstats to fill information into
- * @dentry: A dentry entry belonging to the cgroup for which stats have
- * been requested.
- *
- * Build and fill cgroupstats so that taskstats can export it to user
- * space.
- */
-int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
- struct cgroup *cgrp;
- struct css_task_iter it;
- struct task_struct *tsk;
-
- /* it should be kernfs_node belonging to cgroupfs and is a directory */
- if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
- kernfs_type(kn) != KERNFS_DIR)
- return -EINVAL;
-
- mutex_lock(&cgroup_mutex);
-
- /*
- * We aren't being called from kernfs and there's no guarantee on
- * @kn->priv's validity. For this and css_tryget_online_from_dir(),
- * @kn->priv is RCU safe. Let's do the RCU dancing.
- */
- rcu_read_lock();
- cgrp = rcu_dereference(kn->priv);
- if (!cgrp || cgroup_is_dead(cgrp)) {
- rcu_read_unlock();
- mutex_unlock(&cgroup_mutex);
- return -ENOENT;
- }
- rcu_read_unlock();
+ struct kernfs_open_file *of = s->private;
+ struct css_task_iter *it = of->priv;
+ struct task_struct *task;
- css_task_iter_start(&cgrp->self, &it);
- while ((tsk = css_task_iter_next(&it))) {
- switch (tsk->state) {
- case TASK_RUNNING:
- stats->nr_running++;
- break;
- case TASK_INTERRUPTIBLE:
- stats->nr_sleeping++;
- break;
- case TASK_UNINTERRUPTIBLE:
- stats->nr_uninterruptible++;
- break;
- case TASK_STOPPED:
- stats->nr_stopped++;
- break;
- default:
- if (delayacct_is_task_waiting_on_io(tsk))
- stats->nr_io_wait++;
- break;
- }
- }
- css_task_iter_end(&it);
+ do {
+ task = css_task_iter_next(it);
+ } while (task && !thread_group_leader(task));
- mutex_unlock(&cgroup_mutex);
- return 0;
+ return task;
}
-
-/*
- * seq_file methods for the tasks/procs files. The seq_file position is the
- * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->l->list array.
- */
-
-static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
{
- /*
- * Initially we receive a position value that corresponds to
- * one more than the last pid shown (or 0 on the first call or
- * after a seek to the start). Use a binary-search to find the
- * next pid to display, if any
- */
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct cgroup_pidlist *l;
- enum cgroup_filetype type = seq_cft(s)->private;
- int index = 0, pid = *pos;
- int *iter, ret;
-
- mutex_lock(&cgrp->pidlist_mutex);
+ struct css_task_iter *it = of->priv;
/*
- * !NULL @of->priv indicates that this isn't the first start()
- * after open. If the matching pidlist is around, we can use that.
- * Look for it. Note that @of->priv can't be used directly. It
- * could already have been destroyed.
+ * When a seq_file is seeked, it's always traversed sequentially
+ * from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (of->priv)
- of->priv = cgroup_pidlist_find(cgrp, type);
-
- /*
- * Either this is the first start() after open or the matching
- * pidlist has been destroyed inbetween. Create a new one.
- */
- if (!of->priv) {
- ret = pidlist_array_load(cgrp, type,
- (struct cgroup_pidlist **)&of->priv);
- if (ret)
- return ERR_PTR(ret);
- }
- l = of->priv;
-
- if (pid) {
- int end = l->length;
-
- while (index < end) {
- int mid = (index + end) / 2;
- if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
- index = mid;
- break;
- } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
- index = mid + 1;
- else
- end = mid;
- }
- }
- /* If we're off the end of the array, we're done */
- if (index >= l->length)
- return NULL;
- /* Update the abstract position to be the actual pid that we found */
- iter = l->list + index;
- *pos = cgroup_pid_fry(cgrp, *iter);
- return iter;
-}
-
-static void cgroup_pidlist_stop(struct seq_file *s, void *v)
-{
- struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
-
- if (l)
- mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
- CGROUP_PIDLIST_DESTROY_DELAY);
- mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
-}
+ if (!it) {
+ if (WARN_ON_ONCE((*pos)++))
+ return ERR_PTR(-EINVAL);
-static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
-{
- struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
- pid_t *p = v;
- pid_t *end = l->list + l->length;
- /*
- * Advance to the next pid in the array. If this goes off the
- * end, we're done
- */
- p++;
- if (p >= end) {
- return NULL;
- } else {
- *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
- return p;
+ it = kzalloc(sizeof(*it), GFP_KERNEL);
+ if (!it)
+ return ERR_PTR(-ENOMEM);
+ of->priv = it;
+ css_task_iter_start(&cgrp->self, it);
+ } else if (!(*pos)++) {
+ css_task_iter_end(it);
+ css_task_iter_start(&cgrp->self, it);
}
-}
-
-static int cgroup_pidlist_show(struct seq_file *s, void *v)
-{
- seq_printf(s, "%d\n", *(int *)v);
- return 0;
+ return cgroup_procs_next(s, NULL, NULL);
}
-static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
- struct cftype *cft)
+static int cgroup_procs_show(struct seq_file *s, void *v)
{
- return notify_on_release(css->cgroup);
-}
-
-static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 val)
-{
- if (val)
- set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
- else
- clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
- return 0;
-}
-
-static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-}
-
-static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 val)
-{
- if (val)
- set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
- else
- clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ seq_printf(s, "%d\n", task_tgid_vnr(v));
return 0;
}
/* cgroup core interface files for the default hierarchy */
-static struct cftype cgroup_dfl_base_files[] = {
+static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.procs",
.file_offset = offsetof(struct cgroup, procs_file),
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_PROCS,
+ .release = cgroup_procs_release,
+ .seq_start = cgroup_procs_start,
+ .seq_next = cgroup_procs_next,
+ .seq_show = cgroup_procs_show,
.write = cgroup_procs_write,
},
{
@@ -4917,51 +3810,6 @@ static struct cftype cgroup_dfl_base_files[] = {
{ } /* terminate */
};
-/* cgroup core interface files for the legacy hierarchies */
-static struct cftype cgroup_legacy_base_files[] = {
- {
- .name = "cgroup.procs",
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_PROCS,
- .write = cgroup_procs_write,
- },
- {
- .name = "cgroup.clone_children",
- .read_u64 = cgroup_clone_children_read,
- .write_u64 = cgroup_clone_children_write,
- },
- {
- .name = "cgroup.sane_behavior",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_sane_behavior_show,
- },
- {
- .name = "tasks",
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_TASKS,
- .write = cgroup_tasks_write,
- },
- {
- .name = "notify_on_release",
- .read_u64 = cgroup_read_notify_on_release,
- .write_u64 = cgroup_write_notify_on_release,
- },
- {
- .name = "release_agent",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_release_agent_show,
- .write = cgroup_release_agent_write,
- .max_write_len = PATH_MAX - 1,
- },
- { } /* terminate */
-};
-
/*
* css destruction is four-stage process.
*
@@ -5007,7 +3855,7 @@ static void css_free_work_fn(struct work_struct *work)
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
- cgroup_pidlist_destroy_all(cgrp);
+ cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
if (cgroup_parent(cgrp)) {
@@ -5302,8 +4150,7 @@ out_free_cgrp:
return ERR_PTR(ret);
}
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
struct kernfs_node *kn;
@@ -5507,7 +4354,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
*/
kernfs_remove(cgrp->kn);
- check_for_release(cgroup_parent(cgrp));
+ cgroup1_check_for_release(cgroup_parent(cgrp));
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -5515,7 +4362,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
return 0;
};
-static int cgroup_rmdir(struct kernfs_node *kn)
+int cgroup_rmdir(struct kernfs_node *kn)
{
struct cgroup *cgrp;
int ret = 0;
@@ -5535,10 +4382,8 @@ static int cgroup_rmdir(struct kernfs_node *kn)
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.remount_fs = cgroup_remount,
- .show_options = cgroup_show_options,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
- .rename = cgroup_rename,
.show_path = cgroup_show_path,
};
@@ -5646,8 +4491,8 @@ int __init cgroup_init(void)
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
/*
* The latency of the synchronize_sched() is too high for cgroups,
@@ -5697,7 +4542,7 @@ int __init cgroup_init(void)
continue;
}
- if (cgroup_ssid_no_v1(ssid))
+ if (cgroup1_ssid_disabled(ssid))
printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
ss->name);
@@ -5744,15 +4589,6 @@ static int __init cgroup_wq_init(void)
*/
cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
BUG_ON(!cgroup_destroy_wq);
-
- /*
- * Used to destroy pidlists and separate to serve as flush domain.
- * Cap @max_active to 1 too.
- */
- cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
- 0, 1);
- BUG_ON(!cgroup_pidlist_destroy_wq);
-
return 0;
}
core_initcall(cgroup_wq_init);
@@ -5835,42 +4671,6 @@ out:
return retval;
}
-/* Display information about each subsystem and each hierarchy */
-static int proc_cgroupstats_show(struct seq_file *m, void *v)
-{
- struct cgroup_subsys *ss;
- int i;
-
- seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
- /*
- * ideally we don't want subsystems moving around while we do this.
- * cgroup_mutex is also necessary to guarantee an atomic snapshot of
- * subsys/hierarchy state.
- */
- mutex_lock(&cgroup_mutex);
-
- for_each_subsys(ss, i)
- seq_printf(m, "%s\t%d\t%d\t%d\n",
- ss->legacy_name, ss->root->hierarchy_id,
- atomic_read(&ss->root->nr_cgrps),
- cgroup_ssid_enabled(i));
-
- mutex_unlock(&cgroup_mutex);
- return 0;
-}
-
-static int cgroupstats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_cgroupstats_show, NULL);
-}
-
-static const struct file_operations proc_cgroupstats_operations = {
- .open = cgroupstats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -6050,76 +4850,6 @@ void cgroup_free(struct task_struct *task)
put_css_set(cset);
}
-static void check_for_release(struct cgroup *cgrp)
-{
- if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
- !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
- schedule_work(&cgrp->release_agent_work);
-}
-
-/*
- * Notify userspace when a cgroup is released, by running the
- * configured release agent with the name of the cgroup (path
- * relative to the root of cgroup file system) as the argument.
- *
- * Most likely, this user command will try to rmdir this cgroup.
- *
- * This races with the possibility that some other task will be
- * attached to this cgroup before it is removed, or that some other
- * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
- * The presumed 'rmdir' will fail quietly if this cgroup is no longer
- * unused, and this cgroup will be reprieved from its death sentence,
- * to continue to serve a useful existence. Next time it's released,
- * we will get notified again, if it still has 'notify_on_release' set.
- *
- * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
- * means only wait until the task is successfully execve()'d. The
- * separate release agent task is forked by call_usermodehelper(),
- * then control in this thread returns here, without waiting for the
- * release agent task. We don't bother to wait because the caller of
- * this routine has no use for the exit status of the release agent
- * task, so no sense holding our caller up for that.
- */
-static void cgroup_release_agent(struct work_struct *work)
-{
- struct cgroup *cgrp =
- container_of(work, struct cgroup, release_agent_work);
- char *pathbuf = NULL, *agentbuf = NULL;
- char *argv[3], *envp[3];
- int ret;
-
- mutex_lock(&cgroup_mutex);
-
- pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!pathbuf || !agentbuf)
- goto out;
-
- spin_lock_irq(&css_set_lock);
- ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
- spin_unlock_irq(&css_set_lock);
- if (ret < 0 || ret >= PATH_MAX)
- goto out;
-
- argv[0] = agentbuf;
- argv[1] = pathbuf;
- argv[2] = NULL;
-
- /* minimal command environment */
- envp[0] = "HOME=/";
- envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[2] = NULL;
-
- mutex_unlock(&cgroup_mutex);
- call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- goto out_free;
-out:
- mutex_unlock(&cgroup_mutex);
-out_free:
- kfree(agentbuf);
- kfree(pathbuf);
-}
-
static int __init cgroup_disable(char *str)
{
struct cgroup_subsys *ss;
@@ -6141,33 +4871,6 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
-static int __init cgroup_no_v1(char *str)
-{
- struct cgroup_subsys *ss;
- char *token;
- int i;
-
- while ((token = strsep(&str, ",")) != NULL) {
- if (!*token)
- continue;
-
- if (!strcmp(token, "all")) {
- cgroup_no_v1_mask = U16_MAX;
- break;
- }
-
- for_each_subsys(ss, i) {
- if (strcmp(token, ss->name) &&
- strcmp(token, ss->legacy_name))
- continue;
-
- cgroup_no_v1_mask |= 1 << i;
- }
- }
- return 1;
-}
-__setup("cgroup_no_v1=", cgroup_no_v1);
-
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -6197,7 +4900,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
* have been or be removed at any point. @kn->priv is RCU
* protected for this access. See css_release_work_fn() for details.
*/
- cgrp = rcu_dereference(kn->priv);
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
if (cgrp)
css = cgroup_css(cgrp, ss);
@@ -6349,154 +5052,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
-/* cgroup namespaces */
-
-static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
-{
- return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
-}
-
-static void dec_cgroup_namespaces(struct ucounts *ucounts)
-{
- dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
-}
-
-static struct cgroup_namespace *alloc_cgroup_ns(void)
-{
- struct cgroup_namespace *new_ns;
- int ret;
-
- new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
- if (!new_ns)
- return ERR_PTR(-ENOMEM);
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
- return ERR_PTR(ret);
- }
- atomic_set(&new_ns->count, 1);
- new_ns->ns.ops = &cgroupns_operations;
- return new_ns;
-}
-
-void free_cgroup_ns(struct cgroup_namespace *ns)
-{
- put_css_set(ns->root_cset);
- dec_cgroup_namespaces(ns->ucounts);
- put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kfree(ns);
-}
-EXPORT_SYMBOL(free_cgroup_ns);
-
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
- struct user_namespace *user_ns,
- struct cgroup_namespace *old_ns)
-{
- struct cgroup_namespace *new_ns;
- struct ucounts *ucounts;
- struct css_set *cset;
-
- BUG_ON(!old_ns);
-
- if (!(flags & CLONE_NEWCGROUP)) {
- get_cgroup_ns(old_ns);
- return old_ns;
- }
-
- /* Allow only sysadmin to create cgroup namespace. */
- if (!ns_capable(user_ns, CAP_SYS_ADMIN))
- return ERR_PTR(-EPERM);
-
- ucounts = inc_cgroup_namespaces(user_ns);
- if (!ucounts)
- return ERR_PTR(-ENOSPC);
-
- /* It is not safe to take cgroup_mutex here */
- spin_lock_irq(&css_set_lock);
- cset = task_css_set(current);
- get_css_set(cset);
- spin_unlock_irq(&css_set_lock);
-
- new_ns = alloc_cgroup_ns();
- if (IS_ERR(new_ns)) {
- put_css_set(cset);
- dec_cgroup_namespaces(ucounts);
- return new_ns;
- }
-
- new_ns->user_ns = get_user_ns(user_ns);
- new_ns->ucounts = ucounts;
- new_ns->root_cset = cset;
-
- return new_ns;
-}
-
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
- return container_of(ns, struct cgroup_namespace, ns);
-}
-
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
-{
- struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
-
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
- !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
- return -EPERM;
-
- /* Don't need to do anything if we are attaching to our own cgroupns. */
- if (cgroup_ns == nsproxy->cgroup_ns)
- return 0;
-
- get_cgroup_ns(cgroup_ns);
- put_cgroup_ns(nsproxy->cgroup_ns);
- nsproxy->cgroup_ns = cgroup_ns;
-
- return 0;
-}
-
-static struct ns_common *cgroupns_get(struct task_struct *task)
-{
- struct cgroup_namespace *ns = NULL;
- struct nsproxy *nsproxy;
-
- task_lock(task);
- nsproxy = task->nsproxy;
- if (nsproxy) {
- ns = nsproxy->cgroup_ns;
- get_cgroup_ns(ns);
- }
- task_unlock(task);
-
- return ns ? &ns->ns : NULL;
-}
-
-static void cgroupns_put(struct ns_common *ns)
-{
- put_cgroup_ns(to_cg_ns(ns));
-}
-
-static struct user_namespace *cgroupns_owner(struct ns_common *ns)
-{
- return to_cg_ns(ns)->user_ns;
-}
-
-const struct proc_ns_operations cgroupns_operations = {
- .name = "cgroup",
- .type = CLONE_NEWCGROUP,
- .get = cgroupns_get,
- .put = cgroupns_put,
- .install = cgroupns_install,
- .owner = cgroupns_owner,
-};
-
-static __init int cgroup_namespaces_init(void)
-{
- return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
-
#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type, bool overridable)
@@ -6510,149 +5065,3 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
return ret;
}
#endif /* CONFIG_CGROUP_BPF */
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
- struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
- if (!css)
- return ERR_PTR(-ENOMEM);
-
- return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
- kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- u64 count;
-
- rcu_read_lock();
- count = atomic_read(&task_css_set(current)->refcount);
- rcu_read_unlock();
- return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
- struct cgrp_cset_link *link;
- struct css_set *cset;
- char *name_buf;
-
- name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!name_buf)
- return -ENOMEM;
-
- spin_lock_irq(&css_set_lock);
- rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
-
- cgroup_name(c, name_buf, NAME_MAX + 1);
- seq_printf(seq, "Root %d group %s\n",
- c->root->hierarchy_id, name_buf);
- }
- rcu_read_unlock();
- spin_unlock_irq(&css_set_lock);
- kfree(name_buf);
- return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
- struct cgroup_subsys_state *css = seq_css(seq);
- struct cgrp_cset_link *link;
-
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
- struct css_set *cset = link->cset;
- struct task_struct *task;
- int count = 0;
-
- seq_printf(seq, "css_set %p\n", cset);
-
- list_for_each_entry(task, &cset->tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
-
- list_for_each_entry(task, &cset->mg_tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
- continue;
- overflow:
- seq_puts(seq, " ...\n");
- }
- spin_unlock_irq(&css_set_lock);
- return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- return (!cgroup_is_populated(css->cgroup) &&
- !css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] = {
- {
- .name = "taskcount",
- .read_u64 = debug_taskcount_read,
- },
-
- {
- .name = "current_css_set",
- .read_u64 = current_css_set_read,
- },
-
- {
- .name = "current_css_set_refcount",
- .read_u64 = current_css_set_refcount_read,
- },
-
- {
- .name = "current_css_set_cg_links",
- .seq_show = current_css_set_cg_links_read,
- },
-
- {
- .name = "cgroup_css_links",
- .seq_show = cgroup_css_links_read,
- },
-
- {
- .name = "releasable",
- .read_u64 = releasable_read,
- },
-
- { } /* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
- .css_alloc = debug_css_alloc,
- .css_free = debug_css_free,
- .legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cpuset.c b/kernel/cgroup/cpuset.c
index b3088886cd37..b3088886cd37 100644
--- a/kernel/cpuset.c
+++ b/kernel/cgroup/cpuset.c
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup/freezer.c
index 1b72d56edce5..1b72d56edce5 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup/freezer.c
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
new file mode 100644
index 000000000000..cff7ea62c38f
--- /dev/null
+++ b/kernel/cgroup/namespace.c
@@ -0,0 +1,155 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
+
+
+/* cgroup namespaces */
+
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+ struct cgroup_namespace *new_ns;
+ int ret;
+
+ new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ if (!new_ns)
+ return ERR_PTR(-ENOMEM);
+ ret = ns_alloc_inum(&new_ns->ns);
+ if (ret) {
+ kfree(new_ns);
+ return ERR_PTR(ret);
+ }
+ atomic_set(&new_ns->count, 1);
+ new_ns->ns.ops = &cgroupns_operations;
+ return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+ put_css_set(ns->root_cset);
+ dec_cgroup_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ kfree(ns);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns)
+{
+ struct cgroup_namespace *new_ns;
+ struct ucounts *ucounts;
+ struct css_set *cset;
+
+ BUG_ON(!old_ns);
+
+ if (!(flags & CLONE_NEWCGROUP)) {
+ get_cgroup_ns(old_ns);
+ return old_ns;
+ }
+
+ /* Allow only sysadmin to create cgroup namespace. */
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ ucounts = inc_cgroup_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
+ /* It is not safe to take cgroup_mutex here */
+ spin_lock_irq(&css_set_lock);
+ cset = task_css_set(current);
+ get_css_set(cset);
+ spin_unlock_irq(&css_set_lock);
+
+ new_ns = alloc_cgroup_ns();
+ if (IS_ERR(new_ns)) {
+ put_css_set(cset);
+ dec_cgroup_namespaces(ucounts);
+ return new_ns;
+ }
+
+ new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->ucounts = ucounts;
+ new_ns->root_cset = cset;
+
+ return new_ns;
+}
+
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct cgroup_namespace, ns);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+ struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Don't need to do anything if we are attaching to our own cgroupns. */
+ if (cgroup_ns == nsproxy->cgroup_ns)
+ return 0;
+
+ get_cgroup_ns(cgroup_ns);
+ put_cgroup_ns(nsproxy->cgroup_ns);
+ nsproxy->cgroup_ns = cgroup_ns;
+
+ return 0;
+}
+
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+ struct cgroup_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->cgroup_ns;
+ get_cgroup_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void cgroupns_put(struct ns_common *ns)
+{
+ put_cgroup_ns(to_cg_ns(ns));
+}
+
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+ return to_cg_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+ .name = "cgroup",
+ .type = CLONE_NEWCGROUP,
+ .get = cgroupns_get,
+ .put = cgroupns_put,
+ .install = cgroupns_install,
+ .owner = cgroupns_owner,
+};
+
+static __init int cgroup_namespaces_init(void)
+{
+ return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup/pids.c
index 2bd673783f1a..2bd673783f1a 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup/pids.c
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
new file mode 100644
index 000000000000..defad3c5e7dc
--- /dev/null
+++ b/kernel/cgroup/rdma.c
@@ -0,0 +1,619 @@
+/*
+ * RDMA resource limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop processes from consuming
+ * additional RDMA resources after a certain limit is reached.
+ *
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cgroup.h>
+#include <linux/parser.h>
+#include <linux/cgroup_rdma.h>
+
+#define RDMACG_MAX_STR "max"
+
+/*
+ * Protects list of resource pools maintained on per cgroup basis
+ * and rdma device list.
+ */
+static DEFINE_MUTEX(rdmacg_mutex);
+static LIST_HEAD(rdmacg_devices);
+
+enum rdmacg_file_type {
+ RDMACG_RESOURCE_TYPE_MAX,
+ RDMACG_RESOURCE_TYPE_STAT,
+};
+
+/*
+ * resource table definition as to be seen by the user.
+ * Need to add entries to it when more resources are
+ * added/defined at IB verb/core layer.
+ */
+static char const *rdmacg_resource_names[] = {
+ [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle",
+ [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object",
+};
+
+/* resource tracker for each resource of rdma cgroup */
+struct rdmacg_resource {
+ int max;
+ int usage;
+};
+
+/*
+ * resource pool object which represents per cgroup, per device
+ * resources. There are multiple instances of this object per cgroup,
+ * therefore it cannot be embedded within rdma_cgroup structure. It
+ * is maintained as list.
+ */
+struct rdmacg_resource_pool {
+ struct rdmacg_device *device;
+ struct rdmacg_resource resources[RDMACG_RESOURCE_MAX];
+
+ struct list_head cg_node;
+ struct list_head dev_node;
+
+ /* count active user tasks of this pool */
+ u64 usage_sum;
+ /* total number counts which are set to max */
+ int num_max_cnt;
+};
+
+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct rdma_cgroup, css);
+}
+
+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
+{
+ return css_rdmacg(cg->css.parent);
+}
+
+static inline struct rdma_cgroup *get_current_rdmacg(void)
+{
+ return css_rdmacg(task_get_css(current, rdma_cgrp_id));
+}
+
+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
+ int index, int new_max)
+{
+ if (new_max == S32_MAX) {
+ if (rpool->resources[index].max != S32_MAX)
+ rpool->num_max_cnt++;
+ } else {
+ if (rpool->resources[index].max == S32_MAX)
+ rpool->num_max_cnt--;
+ }
+ rpool->resources[index].max = new_max;
+}
+
+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
+{
+ int i;
+
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+ set_resource_limit(rpool, i, S32_MAX);
+}
+
+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
+{
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_del(&rpool->cg_node);
+ list_del(&rpool->dev_node);
+ kfree(rpool);
+}
+
+static struct rdmacg_resource_pool *
+find_cg_rpool_locked(struct rdma_cgroup *cg,
+ struct rdmacg_device *device)
+
+{
+ struct rdmacg_resource_pool *pool;
+
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_for_each_entry(pool, &cg->rpools, cg_node)
+ if (pool->device == device)
+ return pool;
+
+ return NULL;
+}
+
+static struct rdmacg_resource_pool *
+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
+{
+ struct rdmacg_resource_pool *rpool;
+
+ rpool = find_cg_rpool_locked(cg, device);
+ if (rpool)
+ return rpool;
+
+ rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+ if (!rpool)
+ return ERR_PTR(-ENOMEM);
+
+ rpool->device = device;
+ set_all_resource_max_limit(rpool);
+
+ INIT_LIST_HEAD(&rpool->cg_node);
+ INIT_LIST_HEAD(&rpool->dev_node);
+ list_add_tail(&rpool->cg_node, &cg->rpools);
+ list_add_tail(&rpool->dev_node, &device->rpools);
+ return rpool;
+}
+
+/**
+ * uncharge_cg_locked - uncharge resource for rdma cgroup
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ *
+ * It also frees the resource pool which was created as part of
+ * charging operation when there are no resources attached to
+ * resource pool.
+ */
+static void
+uncharge_cg_locked(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ struct rdmacg_resource_pool *rpool;
+
+ rpool = find_cg_rpool_locked(cg, device);
+
+ /*
+ * rpool cannot be null at this stage. Let kernel operate in case
+ * if there a bug in IB stack or rdma controller, instead of crashing
+ * the system.
+ */
+ if (unlikely(!rpool)) {
+ pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
+ return;
+ }
+
+ rpool->resources[index].usage--;
+
+ /*
+ * A negative count (or overflow) is invalid,
+ * it indicates a bug in the rdma controller.
+ */
+ WARN_ON_ONCE(rpool->resources[index].usage < 0);
+ rpool->usage_sum--;
+ if (rpool->usage_sum == 0 &&
+ rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+ /*
+ * No user of the rpool and all entries are set to max, so
+ * safe to delete this rpool.
+ */
+ free_cg_rpool_locked(rpool);
+ }
+}
+
+/**
+ * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
+ * stop uncharging
+ * @index: index of the resource to uncharge in cg in given resource pool
+ */
+static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ struct rdma_cgroup *stop_cg,
+ enum rdmacg_resource_type index)
+{
+ struct rdma_cgroup *p;
+
+ mutex_lock(&rdmacg_mutex);
+
+ for (p = cg; p != stop_cg; p = parent_rdmacg(p))
+ uncharge_cg_locked(p, device, index);
+
+ mutex_unlock(&rdmacg_mutex);
+
+ css_put(&cg->css);
+}
+
+/**
+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cgroup in given resource pool
+ */
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ if (index >= RDMACG_RESOURCE_MAX)
+ return;
+
+ rdmacg_uncharge_hierarchy(cg, device, NULL, index);
+}
+EXPORT_SYMBOL(rdmacg_uncharge);
+
+/**
+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
+ * @rdmacg: pointer to rdma cgroup which will own this resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cgroup (resource pool)
+ *
+ * This function follows charging resource in hierarchical way.
+ * It will fail if the charge would cause the new value to exceed the
+ * hierarchical limit.
+ * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * Returns pointer to rdmacg for this resource when charging is successful.
+ *
+ * Charger needs to account resources on two criteria.
+ * (a) per cgroup & (b) per device resource usage.
+ * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
+ * the configured limits. Per device provides granular configuration
+ * in multi device usage. It allocates resource pool in the hierarchy
+ * for each parent it come across for first resource. Later on resource
+ * pool will be available. Therefore it will be much faster thereon
+ * to charge/uncharge.
+ */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ struct rdma_cgroup *cg, *p;
+ struct rdmacg_resource_pool *rpool;
+ s64 new;
+ int ret = 0;
+
+ if (index >= RDMACG_RESOURCE_MAX)
+ return -EINVAL;
+
+ /*
+ * hold on to css, as cgroup can be removed but resource
+ * accounting happens on css.
+ */
+ cg = get_current_rdmacg();
+
+ mutex_lock(&rdmacg_mutex);
+ for (p = cg; p; p = parent_rdmacg(p)) {
+ rpool = get_cg_rpool_locked(p, device);
+ if (IS_ERR(rpool)) {
+ ret = PTR_ERR(rpool);
+ goto err;
+ } else {
+ new = rpool->resources[index].usage + 1;
+ if (new > rpool->resources[index].max) {
+ ret = -EAGAIN;
+ goto err;
+ } else {
+ rpool->resources[index].usage = new;
+ rpool->usage_sum++;
+ }
+ }
+ }
+ mutex_unlock(&rdmacg_mutex);
+
+ *rdmacg = cg;
+ return 0;
+
+err:
+ mutex_unlock(&rdmacg_mutex);
+ rdmacg_uncharge_hierarchy(cg, device, p, index);
+ return ret;
+}
+EXPORT_SYMBOL(rdmacg_try_charge);
+
+/**
+ * rdmacg_register_device - register rdmacg device to rdma controller.
+ * @device: pointer to rdmacg device whose resources need to be accounted.
+ *
+ * If IB stack wish a device to participate in rdma cgroup resource
+ * tracking, it must invoke this API to register with rdma cgroup before
+ * any user space application can start using the RDMA resources.
+ * Returns 0 on success or EINVAL when table length given is beyond
+ * supported size.
+ */
+int rdmacg_register_device(struct rdmacg_device *device)
+{
+ INIT_LIST_HEAD(&device->dev_node);
+ INIT_LIST_HEAD(&device->rpools);
+
+ mutex_lock(&rdmacg_mutex);
+ list_add_tail(&device->dev_node, &rdmacg_devices);
+ mutex_unlock(&rdmacg_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(rdmacg_register_device);
+
+/**
+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
+ * @device: pointer to rdmacg device which was previously registered with rdma
+ * controller using rdmacg_register_device().
+ *
+ * IB stack must invoke this after all the resources of the IB device
+ * are destroyed and after ensuring that no more resources will be created
+ * when this API is invoked.
+ */
+void rdmacg_unregister_device(struct rdmacg_device *device)
+{
+ struct rdmacg_resource_pool *rpool, *tmp;
+
+ /*
+ * Synchronize with any active resource settings,
+ * usage query happening via configfs.
+ */
+ mutex_lock(&rdmacg_mutex);
+ list_del_init(&device->dev_node);
+
+ /*
+ * Now that this device is off the cgroup list, its safe to free
+ * all the rpool resources.
+ */
+ list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
+ free_cg_rpool_locked(rpool);
+
+ mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_unregister_device);
+
+static int parse_resource(char *c, int *intval)
+{
+ substring_t argstr;
+ const char **table = &rdmacg_resource_names[0];
+ char *name, *value = c;
+ size_t len;
+ int ret, i = 0;
+
+ name = strsep(&value, "=");
+ if (!name || !value)
+ return -EINVAL;
+
+ len = strlen(value);
+
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+ if (strcmp(table[i], name))
+ continue;
+
+ argstr.from = value;
+ argstr.to = value + len;
+
+ ret = match_int(&argstr, intval);
+ if (ret >= 0) {
+ if (*intval < 0)
+ break;
+ return i;
+ }
+ if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+ *intval = S32_MAX;
+ return i;
+ }
+ break;
+ }
+ return -EINVAL;
+}
+
+static int rdmacg_parse_limits(char *options,
+ int *new_limits, unsigned long *enables)
+{
+ char *c;
+ int err = -EINVAL;
+
+ /* parse resource options */
+ while ((c = strsep(&options, " ")) != NULL) {
+ int index, intval;
+
+ index = parse_resource(c, &intval);
+ if (index < 0)
+ goto err;
+
+ new_limits[index] = intval;
+ *enables |= BIT(index);
+ }
+ return 0;
+
+err:
+ return err;
+}
+
+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
+{
+ struct rdmacg_device *device;
+
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_for_each_entry(device, &rdmacg_devices, dev_node)
+ if (!strcmp(name, device->name))
+ return device;
+
+ return NULL;
+}
+
+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+ const char *dev_name;
+ struct rdmacg_resource_pool *rpool;
+ struct rdmacg_device *device;
+ char *options = strstrip(buf);
+ int *new_limits;
+ unsigned long enables = 0;
+ int i = 0, ret = 0;
+
+ /* extract the device name first */
+ dev_name = strsep(&options, " ");
+ if (!dev_name) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+ if (!new_limits) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = rdmacg_parse_limits(options, new_limits, &enables);
+ if (ret)
+ goto parse_err;
+
+ /* acquire lock to synchronize with hot plug devices */
+ mutex_lock(&rdmacg_mutex);
+
+ device = rdmacg_get_device_locked(dev_name);
+ if (!device) {
+ ret = -ENODEV;
+ goto dev_err;
+ }
+
+ rpool = get_cg_rpool_locked(cg, device);
+ if (IS_ERR(rpool)) {
+ ret = PTR_ERR(rpool);
+ goto dev_err;
+ }
+
+ /* now set the new limits of the rpool */
+ for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
+ set_resource_limit(rpool, i, new_limits[i]);
+
+ if (rpool->usage_sum == 0 &&
+ rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+ /*
+ * No user of the rpool and all entries are set to max, so
+ * safe to delete this rpool.
+ */
+ free_cg_rpool_locked(rpool);
+ }
+
+dev_err:
+ mutex_unlock(&rdmacg_mutex);
+
+parse_err:
+ kfree(new_limits);
+
+err:
+ return ret ?: nbytes;
+}
+
+static void print_rpool_values(struct seq_file *sf,
+ struct rdmacg_resource_pool *rpool)
+{
+ enum rdmacg_file_type sf_type;
+ int i;
+ u32 value;
+
+ sf_type = seq_cft(sf)->private;
+
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+ seq_puts(sf, rdmacg_resource_names[i]);
+ seq_putc(sf, '=');
+ if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
+ if (rpool)
+ value = rpool->resources[i].max;
+ else
+ value = S32_MAX;
+ } else {
+ if (rpool)
+ value = rpool->resources[i].usage;
+ else
+ value = 0;
+ }
+
+ if (value == S32_MAX)
+ seq_puts(sf, RDMACG_MAX_STR);
+ else
+ seq_printf(sf, "%d", value);
+ seq_putc(sf, ' ');
+ }
+}
+
+static int rdmacg_resource_read(struct seq_file *sf, void *v)
+{
+ struct rdmacg_device *device;
+ struct rdmacg_resource_pool *rpool;
+ struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+
+ mutex_lock(&rdmacg_mutex);
+
+ list_for_each_entry(device, &rdmacg_devices, dev_node) {
+ seq_printf(sf, "%s ", device->name);
+
+ rpool = find_cg_rpool_locked(cg, device);
+ print_rpool_values(sf, rpool);
+
+ seq_putc(sf, '\n');
+ }
+
+ mutex_unlock(&rdmacg_mutex);
+ return 0;
+}
+
+static struct cftype rdmacg_files[] = {
+ {
+ .name = "max",
+ .write = rdmacg_resource_set_max,
+ .seq_show = rdmacg_resource_read,
+ .private = RDMACG_RESOURCE_TYPE_MAX,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .seq_show = rdmacg_resource_read,
+ .private = RDMACG_RESOURCE_TYPE_STAT,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+static struct cgroup_subsys_state *
+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct rdma_cgroup *cg;
+
+ cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ if (!cg)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&cg->rpools);
+ return &cg->css;
+}
+
+static void rdmacg_css_free(struct cgroup_subsys_state *css)
+{
+ struct rdma_cgroup *cg = css_rdmacg(css);
+
+ kfree(cg);
+}
+
+/**
+ * rdmacg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away and responsible
+ * for shooting down all rdmacg associated with @css. As part of that it
+ * marks all the resource pool entries to max value, so that when resources are
+ * uncharged, associated resource pool can be freed as well.
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+ struct rdma_cgroup *cg = css_rdmacg(css);
+ struct rdmacg_resource_pool *rpool;
+
+ mutex_lock(&rdmacg_mutex);
+
+ list_for_each_entry(rpool, &cg->rpools, cg_node)
+ set_all_resource_max_limit(rpool);
+
+ mutex_unlock(&rdmacg_mutex);
+}
+
+struct cgroup_subsys rdma_cgrp_subsys = {
+ .css_alloc = rdmacg_css_alloc,
+ .css_free = rdmacg_css_free,
+ .css_offline = rdmacg_css_offline,
+ .legacy_cftypes = rdmacg_files,
+ .dfl_cftypes = rdmacg_files,
+};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b2eb3542e829..5b4e0b98f4eb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10959,5 +10959,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
.attach = perf_cgroup_attach,
+ /*
+ * Implicitly enable on dfl hierarchy so that perf events can
+ * always be filtered by cgroup2 path as long as perf_event
+ * controller is not mounted on a legacy hierarchy.
+ */
+ .implicit_on_dfl = true,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a9b8cf500591..6c9cb208ac48 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -236,12 +236,28 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry
static inline struct jump_entry *static_key_entries(struct static_key *key)
{
- return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+ WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED);
+ return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK);
}
static inline bool static_key_type(struct static_key *key)
{
- return (unsigned long)key->entries & JUMP_TYPE_MASK;
+ return key->type & JUMP_TYPE_TRUE;
+}
+
+static inline bool static_key_linked(struct static_key *key)
+{
+ return key->type & JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_clear_linked(struct static_key *key)
+{
+ key->type &= ~JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_set_linked(struct static_key *key)
+{
+ key->type |= JUMP_TYPE_LINKED;
}
static inline struct static_key *jump_entry_key(struct jump_entry *entry)
@@ -254,6 +270,26 @@ static bool jump_entry_branch(struct jump_entry *entry)
return (unsigned long)entry->key & 1UL;
}
+/***
+ * A 'struct static_key' uses a union such that it either points directly
+ * to a table of 'struct jump_entry' or to a linked list of modules which in
+ * turn point to 'struct jump_entry' tables.
+ *
+ * The two lower bits of the pointer are used to keep track of which pointer
+ * type is in use and to store the initial branch direction, we use an access
+ * function which preserves these bits.
+ */
+static void static_key_set_entries(struct static_key *key,
+ struct jump_entry *entries)
+{
+ unsigned long type;
+
+ WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK);
+ type = key->type & JUMP_TYPE_MASK;
+ key->entries = entries;
+ key->type |= type;
+}
+
static enum jump_label_type jump_label_type(struct jump_entry *entry)
{
struct static_key *key = jump_entry_key(entry);
@@ -313,13 +349,7 @@ void __init jump_label_init(void)
continue;
key = iterk;
- /*
- * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
- */
- *((unsigned long *)&key->entries) += (unsigned long)iter;
-#ifdef CONFIG_MODULES
- key->next = NULL;
-#endif
+ static_key_set_entries(key, iter);
}
static_key_initialized = true;
jump_label_unlock();
@@ -343,6 +373,29 @@ struct static_key_mod {
struct module *mod;
};
+static inline struct static_key_mod *static_key_mod(struct static_key *key)
+{
+ WARN_ON_ONCE(!(key->type & JUMP_TYPE_LINKED));
+ return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK);
+}
+
+/***
+ * key->type and key->next are the same via union.
+ * This sets key->next and preserves the type bits.
+ *
+ * See additional comments above static_key_set_entries().
+ */
+static void static_key_set_mod(struct static_key *key,
+ struct static_key_mod *mod)
+{
+ unsigned long type;
+
+ WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK);
+ type = key->type & JUMP_TYPE_MASK;
+ key->next = mod;
+ key->type |= type;
+}
+
static int __jump_label_mod_text_reserved(void *start, void *end)
{
struct module *mod;
@@ -365,11 +418,23 @@ static void __jump_label_mod_update(struct static_key *key)
{
struct static_key_mod *mod;
- for (mod = key->next; mod; mod = mod->next) {
- struct module *m = mod->mod;
+ for (mod = static_key_mod(key); mod; mod = mod->next) {
+ struct jump_entry *stop;
+ struct module *m;
+
+ /*
+ * NULL if the static_key is defined in a module
+ * that does not use it
+ */
+ if (!mod->entries)
+ continue;
- __jump_label_update(key, mod->entries,
- m->jump_entries + m->num_jump_entries);
+ m = mod->mod;
+ if (!m)
+ stop = __stop___jump_table;
+ else
+ stop = m->jump_entries + m->num_jump_entries;
+ __jump_label_update(key, mod->entries, stop);
}
}
@@ -404,7 +469,7 @@ static int jump_label_add_module(struct module *mod)
struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
struct jump_entry *iter;
struct static_key *key = NULL;
- struct static_key_mod *jlm;
+ struct static_key_mod *jlm, *jlm2;
/* if the module doesn't have jump label entries, just return */
if (iter_start == iter_stop)
@@ -421,20 +486,32 @@ static int jump_label_add_module(struct module *mod)
key = iterk;
if (within_module(iter->key, mod)) {
- /*
- * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
- */
- *((unsigned long *)&key->entries) += (unsigned long)iter;
- key->next = NULL;
+ static_key_set_entries(key, iter);
continue;
}
jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
if (!jlm)
return -ENOMEM;
+ if (!static_key_linked(key)) {
+ jlm2 = kzalloc(sizeof(struct static_key_mod),
+ GFP_KERNEL);
+ if (!jlm2) {
+ kfree(jlm);
+ return -ENOMEM;
+ }
+ preempt_disable();
+ jlm2->mod = __module_address((unsigned long)key);
+ preempt_enable();
+ jlm2->entries = static_key_entries(key);
+ jlm2->next = NULL;
+ static_key_set_mod(key, jlm2);
+ static_key_set_linked(key);
+ }
jlm->mod = mod;
jlm->entries = iter;
- jlm->next = key->next;
- key->next = jlm;
+ jlm->next = static_key_mod(key);
+ static_key_set_mod(key, jlm);
+ static_key_set_linked(key);
/* Only update if we've changed from our initial state */
if (jump_label_type(iter) != jump_label_init_type(iter))
@@ -461,16 +538,34 @@ static void jump_label_del_module(struct module *mod)
if (within_module(iter->key, mod))
continue;
+ /* No memory during module load */
+ if (WARN_ON(!static_key_linked(key)))
+ continue;
+
prev = &key->next;
- jlm = key->next;
+ jlm = static_key_mod(key);
while (jlm && jlm->mod != mod) {
prev = &jlm->next;
jlm = jlm->next;
}
- if (jlm) {
+ /* No memory during module load */
+ if (WARN_ON(!jlm))
+ continue;
+
+ if (prev == &key->next)
+ static_key_set_mod(key, jlm->next);
+ else
*prev = jlm->next;
+
+ kfree(jlm);
+
+ jlm = static_key_mod(key);
+ /* if only one etry is left, fold it back into the static_key */
+ if (jlm->next == NULL) {
+ static_key_set_entries(key, jlm->entries);
+ static_key_clear_linked(key);
kfree(jlm);
}
}
@@ -499,8 +594,10 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
case MODULE_STATE_COMING:
jump_label_lock();
ret = jump_label_add_module(mod);
- if (ret)
+ if (ret) {
+ WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n");
jump_label_del_module(mod);
+ }
jump_label_unlock();
break;
case MODULE_STATE_GOING:
@@ -561,11 +658,14 @@ int jump_label_text_reserved(void *start, void *end)
static void jump_label_update(struct static_key *key)
{
struct jump_entry *stop = __stop___jump_table;
- struct jump_entry *entry = static_key_entries(key);
+ struct jump_entry *entry;
#ifdef CONFIG_MODULES
struct module *mod;
- __jump_label_mod_update(key);
+ if (static_key_linked(key)) {
+ __jump_label_mod_update(key);
+ return;
+ }
preempt_disable();
mod = __module_address((unsigned long)key);
@@ -573,6 +673,7 @@ static void jump_label_update(struct static_key *key)
stop = mod->jump_entries + mod->num_jump_entries;
preempt_enable();
#endif
+ entry = static_key_entries(key);
/* if there are no users, entry can be NULL */
if (entry)
__jump_label_update(key, entry, stop);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eb230f06ba41..0c0609326391 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1110,13 +1110,6 @@ struct ftrace_func_entry {
unsigned long ip;
};
-struct ftrace_hash {
- unsigned long size_bits;
- struct hlist_head *buckets;
- unsigned long count;
- struct rcu_head rcu;
-};
-
/*
* We make these constant because no one should touch them,
* but they are used as the default "empty hash", to avoid allocating
@@ -1192,26 +1185,24 @@ struct ftrace_page {
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
-static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
+static __always_inline unsigned long
+ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip)
{
- return !hash || !hash->count;
+ if (hash->size_bits > 0)
+ return hash_long(ip, hash->size_bits);
+
+ return 0;
}
-static struct ftrace_func_entry *
-ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
+/* Only use this function if ftrace_hash_empty() has already been tested */
+static __always_inline struct ftrace_func_entry *
+__ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
{
unsigned long key;
struct ftrace_func_entry *entry;
struct hlist_head *hhd;
- if (ftrace_hash_empty(hash))
- return NULL;
-
- if (hash->size_bits > 0)
- key = hash_long(ip, hash->size_bits);
- else
- key = 0;
-
+ key = ftrace_hash_key(hash, ip);
hhd = &hash->buckets[key];
hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
@@ -1221,17 +1212,32 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
return NULL;
}
+/**
+ * ftrace_lookup_ip - Test to see if an ip exists in an ftrace_hash
+ * @hash: The hash to look at
+ * @ip: The instruction pointer to test
+ *
+ * Search a given @hash to see if a given instruction pointer (@ip)
+ * exists in it.
+ *
+ * Returns the entry that holds the @ip if found. NULL otherwise.
+ */
+struct ftrace_func_entry *
+ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
+{
+ if (ftrace_hash_empty(hash))
+ return NULL;
+
+ return __ftrace_lookup_ip(hash, ip);
+}
+
static void __add_hash_entry(struct ftrace_hash *hash,
struct ftrace_func_entry *entry)
{
struct hlist_head *hhd;
unsigned long key;
- if (hash->size_bits)
- key = hash_long(entry->ip, hash->size_bits);
- else
- key = 0;
-
+ key = ftrace_hash_key(hash, entry->ip);
hhd = &hash->buckets[key];
hlist_add_head(&entry->hlist, hhd);
hash->count++;
@@ -1383,9 +1389,8 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
struct ftrace_hash *new_hash);
-static int
-ftrace_hash_move(struct ftrace_ops *ops, int enable,
- struct ftrace_hash **dst, struct ftrace_hash *src)
+static struct ftrace_hash *
+__ftrace_hash_move(struct ftrace_hash *src)
{
struct ftrace_func_entry *entry;
struct hlist_node *tn;
@@ -1393,21 +1398,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
- int ret;
int i;
- /* Reject setting notrace hash on IPMODIFY ftrace_ops */
- if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
- return -EINVAL;
-
/*
- * If the new source is empty, just free dst and assign it
- * the empty_hash.
+ * If the new source is empty, just return the empty_hash.
*/
- if (!src->count) {
- new_hash = EMPTY_HASH;
- goto update;
- }
+ if (!src->count)
+ return EMPTY_HASH;
/*
* Make the hash size about 1/2 the # found
@@ -1421,7 +1418,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
new_hash = alloc_ftrace_hash(bits);
if (!new_hash)
- return -ENOMEM;
+ return NULL;
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
@@ -1432,7 +1429,24 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
}
}
-update:
+ return new_hash;
+}
+
+static int
+ftrace_hash_move(struct ftrace_ops *ops, int enable,
+ struct ftrace_hash **dst, struct ftrace_hash *src)
+{
+ struct ftrace_hash *new_hash;
+ int ret;
+
+ /* Reject setting notrace hash on IPMODIFY ftrace_ops */
+ if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
+ return -EINVAL;
+
+ new_hash = __ftrace_hash_move(src);
+ if (!new_hash)
+ return -ENOMEM;
+
/* Make sure this can be applied if it is IPMODIFY ftrace_ops */
if (enable) {
/* IPMODIFY should be updated only when filter_hash updating */
@@ -1466,9 +1480,9 @@ static bool hash_contains_ip(unsigned long ip,
* notrace hash is considered not in the notrace hash.
*/
return (ftrace_hash_empty(hash->filter_hash) ||
- ftrace_lookup_ip(hash->filter_hash, ip)) &&
+ __ftrace_lookup_ip(hash->filter_hash, ip)) &&
(ftrace_hash_empty(hash->notrace_hash) ||
- !ftrace_lookup_ip(hash->notrace_hash, ip));
+ !__ftrace_lookup_ip(hash->notrace_hash, ip));
}
/*
@@ -2880,7 +2894,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
/* The function must be in the filter */
if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
- !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
+ !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
return 0;
/* If in notrace hash, we ignore it too */
@@ -4382,7 +4396,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
-static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
+static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
static unsigned long save_global_trampoline;
static unsigned long save_global_flags;
@@ -4405,18 +4419,17 @@ static void __init set_ftrace_early_graph(char *buf, int enable)
{
int ret;
char *func;
- unsigned long *table = ftrace_graph_funcs;
- int *count = &ftrace_graph_count;
+ struct ftrace_hash *hash;
- if (!enable) {
- table = ftrace_graph_notrace_funcs;
- count = &ftrace_graph_notrace_count;
- }
+ if (enable)
+ hash = ftrace_graph_hash;
+ else
+ hash = ftrace_graph_notrace_hash;
while (buf) {
func = strsep(&buf, ",");
/* we allow only one expression at a time */
- ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
+ ret = ftrace_graph_set_hash(hash, func);
if (ret)
printk(KERN_DEBUG "ftrace: function %s not "
"traceable\n", func);
@@ -4540,26 +4553,55 @@ static const struct file_operations ftrace_notrace_fops = {
static DEFINE_MUTEX(graph_lock);
-int ftrace_graph_count;
-int ftrace_graph_notrace_count;
-unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
-unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
+struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH;
+struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH;
+
+enum graph_filter_type {
+ GRAPH_FILTER_NOTRACE = 0,
+ GRAPH_FILTER_FUNCTION,
+};
+
+#define FTRACE_GRAPH_EMPTY ((void *)1)
struct ftrace_graph_data {
- unsigned long *table;
- size_t size;
- int *count;
- const struct seq_operations *seq_ops;
+ struct ftrace_hash *hash;
+ struct ftrace_func_entry *entry;
+ int idx; /* for hash table iteration */
+ enum graph_filter_type type;
+ struct ftrace_hash *new_hash;
+ const struct seq_operations *seq_ops;
+ struct trace_parser parser;
};
static void *
__g_next(struct seq_file *m, loff_t *pos)
{
struct ftrace_graph_data *fgd = m->private;
+ struct ftrace_func_entry *entry = fgd->entry;
+ struct hlist_head *head;
+ int i, idx = fgd->idx;
- if (*pos >= *fgd->count)
+ if (*pos >= fgd->hash->count)
return NULL;
- return &fgd->table[*pos];
+
+ if (entry) {
+ hlist_for_each_entry_continue(entry, hlist) {
+ fgd->entry = entry;
+ return entry;
+ }
+
+ idx++;
+ }
+
+ for (i = idx; i < 1 << fgd->hash->size_bits; i++) {
+ head = &fgd->hash->buckets[i];
+ hlist_for_each_entry(entry, head, hlist) {
+ fgd->entry = entry;
+ fgd->idx = i;
+ return entry;
+ }
+ }
+ return NULL;
}
static void *
@@ -4575,10 +4617,19 @@ static void *g_start(struct seq_file *m, loff_t *pos)
mutex_lock(&graph_lock);
+ if (fgd->type == GRAPH_FILTER_FUNCTION)
+ fgd->hash = rcu_dereference_protected(ftrace_graph_hash,
+ lockdep_is_held(&graph_lock));
+ else
+ fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ lockdep_is_held(&graph_lock));
+
/* Nothing, tell g_show to print all functions are enabled */
- if (!*fgd->count && !*pos)
- return (void *)1;
+ if (ftrace_hash_empty(fgd->hash) && !*pos)
+ return FTRACE_GRAPH_EMPTY;
+ fgd->idx = 0;
+ fgd->entry = NULL;
return __g_next(m, pos);
}
@@ -4589,22 +4640,22 @@ static void g_stop(struct seq_file *m, void *p)
static int g_show(struct seq_file *m, void *v)
{
- unsigned long *ptr = v;
+ struct ftrace_func_entry *entry = v;
- if (!ptr)
+ if (!entry)
return 0;
- if (ptr == (unsigned long *)1) {
+ if (entry == FTRACE_GRAPH_EMPTY) {
struct ftrace_graph_data *fgd = m->private;
- if (fgd->table == ftrace_graph_funcs)
+ if (fgd->type == GRAPH_FILTER_FUNCTION)
seq_puts(m, "#### all functions enabled ####\n");
else
seq_puts(m, "#### no functions disabled ####\n");
return 0;
}
- seq_printf(m, "%ps\n", (void *)*ptr);
+ seq_printf(m, "%ps\n", (void *)entry->ip);
return 0;
}
@@ -4621,24 +4672,51 @@ __ftrace_graph_open(struct inode *inode, struct file *file,
struct ftrace_graph_data *fgd)
{
int ret = 0;
+ struct ftrace_hash *new_hash = NULL;
- mutex_lock(&graph_lock);
- if ((file->f_mode & FMODE_WRITE) &&
- (file->f_flags & O_TRUNC)) {
- *fgd->count = 0;
- memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));
+ if (file->f_mode & FMODE_WRITE) {
+ const int size_bits = FTRACE_HASH_DEFAULT_BITS;
+
+ if (trace_parser_get_init(&fgd->parser, FTRACE_BUFF_MAX))
+ return -ENOMEM;
+
+ if (file->f_flags & O_TRUNC)
+ new_hash = alloc_ftrace_hash(size_bits);
+ else
+ new_hash = alloc_and_copy_ftrace_hash(size_bits,
+ fgd->hash);
+ if (!new_hash) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
- mutex_unlock(&graph_lock);
if (file->f_mode & FMODE_READ) {
- ret = seq_open(file, fgd->seq_ops);
+ ret = seq_open(file, &ftrace_graph_seq_ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = fgd;
+ } else {
+ /* Failed */
+ free_ftrace_hash(new_hash);
+ new_hash = NULL;
}
} else
file->private_data = fgd;
+out:
+ if (ret < 0 && file->f_mode & FMODE_WRITE)
+ trace_parser_put(&fgd->parser);
+
+ fgd->new_hash = new_hash;
+
+ /*
+ * All uses of fgd->hash must be taken with the graph_lock
+ * held. The graph_lock is going to be released, so force
+ * fgd->hash to be reinitialized when it is taken again.
+ */
+ fgd->hash = NULL;
+
return ret;
}
@@ -4646,6 +4724,7 @@ static int
ftrace_graph_open(struct inode *inode, struct file *file)
{
struct ftrace_graph_data *fgd;
+ int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
@@ -4654,18 +4733,26 @@ ftrace_graph_open(struct inode *inode, struct file *file)
if (fgd == NULL)
return -ENOMEM;
- fgd->table = ftrace_graph_funcs;
- fgd->size = FTRACE_GRAPH_MAX_FUNCS;
- fgd->count = &ftrace_graph_count;
+ mutex_lock(&graph_lock);
+
+ fgd->hash = rcu_dereference_protected(ftrace_graph_hash,
+ lockdep_is_held(&graph_lock));
+ fgd->type = GRAPH_FILTER_FUNCTION;
fgd->seq_ops = &ftrace_graph_seq_ops;
- return __ftrace_graph_open(inode, file, fgd);
+ ret = __ftrace_graph_open(inode, file, fgd);
+ if (ret < 0)
+ kfree(fgd);
+
+ mutex_unlock(&graph_lock);
+ return ret;
}
static int
ftrace_graph_notrace_open(struct inode *inode, struct file *file)
{
struct ftrace_graph_data *fgd;
+ int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
@@ -4674,45 +4761,97 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file)
if (fgd == NULL)
return -ENOMEM;
- fgd->table = ftrace_graph_notrace_funcs;
- fgd->size = FTRACE_GRAPH_MAX_FUNCS;
- fgd->count = &ftrace_graph_notrace_count;
+ mutex_lock(&graph_lock);
+
+ fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ lockdep_is_held(&graph_lock));
+ fgd->type = GRAPH_FILTER_NOTRACE;
fgd->seq_ops = &ftrace_graph_seq_ops;
- return __ftrace_graph_open(inode, file, fgd);
+ ret = __ftrace_graph_open(inode, file, fgd);
+ if (ret < 0)
+ kfree(fgd);
+
+ mutex_unlock(&graph_lock);
+ return ret;
}
static int
ftrace_graph_release(struct inode *inode, struct file *file)
{
+ struct ftrace_graph_data *fgd;
+ struct ftrace_hash *old_hash, *new_hash;
+ struct trace_parser *parser;
+ int ret = 0;
+
if (file->f_mode & FMODE_READ) {
struct seq_file *m = file->private_data;
- kfree(m->private);
+ fgd = m->private;
seq_release(inode, file);
} else {
- kfree(file->private_data);
+ fgd = file->private_data;
}
- return 0;
+
+ if (file->f_mode & FMODE_WRITE) {
+
+ parser = &fgd->parser;
+
+ if (trace_parser_loaded((parser))) {
+ parser->buffer[parser->idx] = 0;
+ ret = ftrace_graph_set_hash(fgd->new_hash,
+ parser->buffer);
+ }
+
+ trace_parser_put(parser);
+
+ new_hash = __ftrace_hash_move(fgd->new_hash);
+ if (!new_hash) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&graph_lock);
+
+ if (fgd->type == GRAPH_FILTER_FUNCTION) {
+ old_hash = rcu_dereference_protected(ftrace_graph_hash,
+ lockdep_is_held(&graph_lock));
+ rcu_assign_pointer(ftrace_graph_hash, new_hash);
+ } else {
+ old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ lockdep_is_held(&graph_lock));
+ rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash);
+ }
+
+ mutex_unlock(&graph_lock);
+
+ /* Wait till all users are no longer using the old hash */
+ synchronize_sched();
+
+ free_ftrace_hash(old_hash);
+ }
+
+ out:
+ kfree(fgd->new_hash);
+ kfree(fgd);
+
+ return ret;
}
static int
-ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
+ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
{
struct ftrace_glob func_g;
struct dyn_ftrace *rec;
struct ftrace_page *pg;
+ struct ftrace_func_entry *entry;
int fail = 1;
int not;
- bool exists;
- int i;
/* decode regex */
func_g.type = filter_parse_regex(buffer, strlen(buffer),
&func_g.search, &not);
- if (!not && *idx >= size)
- return -EBUSY;
func_g.len = strlen(func_g.search);
@@ -4729,26 +4868,18 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
continue;
if (ftrace_match_record(rec, &func_g, NULL, 0)) {
- /* if it is in the array */
- exists = false;
- for (i = 0; i < *idx; i++) {
- if (array[i] == rec->ip) {
- exists = true;
- break;
- }
- }
+ entry = ftrace_lookup_ip(hash, rec->ip);
if (!not) {
fail = 0;
- if (!exists) {
- array[(*idx)++] = rec->ip;
- if (*idx >= size)
- goto out;
- }
+
+ if (entry)
+ continue;
+ if (add_hash_entry(hash, rec->ip) < 0)
+ goto out;
} else {
- if (exists) {
- array[i] = array[--(*idx)];
- array[*idx] = 0;
+ if (entry) {
+ free_hash_entry(hash, entry);
fail = 0;
}
}
@@ -4767,35 +4898,34 @@ static ssize_t
ftrace_graph_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct trace_parser parser;
ssize_t read, ret = 0;
struct ftrace_graph_data *fgd = file->private_data;
+ struct trace_parser *parser;
if (!cnt)
return 0;
- if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
- return -ENOMEM;
-
- read = trace_get_user(&parser, ubuf, cnt, ppos);
+ /* Read mode uses seq functions */
+ if (file->f_mode & FMODE_READ) {
+ struct seq_file *m = file->private_data;
+ fgd = m->private;
+ }
- if (read >= 0 && trace_parser_loaded((&parser))) {
- parser.buffer[parser.idx] = 0;
+ parser = &fgd->parser;
- mutex_lock(&graph_lock);
+ read = trace_get_user(parser, ubuf, cnt, ppos);
- /* we allow only one expression at a time */
- ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
- parser.buffer);
+ if (read >= 0 && trace_parser_loaded(parser) &&
+ !trace_parser_cont(parser)) {
- mutex_unlock(&graph_lock);
+ ret = ftrace_graph_set_hash(fgd->new_hash,
+ parser->buffer);
+ trace_parser_clear(parser);
}
if (!ret)
ret = read;
- trace_parser_put(&parser);
-
return ret;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 310f0ea0d1a2..707445ceb7ef 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -260,16 +260,8 @@ unsigned long long ns2usecs(u64 nsec)
TRACE_ITER_EVENT_FORK
/*
- * The global_trace is the descriptor that holds the tracing
- * buffers for the live tracing. For each CPU, it contains
- * a link list of pages that will store trace entries. The
- * page descriptor of the pages in the memory is used to hold
- * the link list by linking the lru item in the page descriptor
- * to each of the pages in the buffer per CPU.
- *
- * For each active CPU there is a data field that holds the
- * pages for the buffer for that CPU. Each CPU has the same number
- * of pages allocated for its buffer.
+ * The global_trace is the descriptor that holds the top-level tracing
+ * buffers for the live tracing.
*/
static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
@@ -1193,6 +1185,7 @@ int trace_parser_get_init(struct trace_parser *parser, int size)
void trace_parser_put(struct trace_parser *parser)
{
kfree(parser->buffer);
+ parser->buffer = NULL;
}
/*
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1ea51ab53edf..ae1cce91fead 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -753,6 +753,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
extern char trace_find_mark(unsigned long long duration);
+struct ftrace_hash {
+ unsigned long size_bits;
+ struct hlist_head *buckets;
+ unsigned long count;
+ struct rcu_head rcu;
+};
+
+struct ftrace_func_entry *
+ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);
+
+static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
+{
+ return !hash || !hash->count;
+}
+
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -787,53 +802,50 @@ extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned long flags, int pc);
-
#ifdef CONFIG_DYNAMIC_FTRACE
-/* TODO: make this variable */
-#define FTRACE_GRAPH_MAX_FUNCS 32
-extern int ftrace_graph_count;
-extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
-extern int ftrace_graph_notrace_count;
-extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];
+extern struct ftrace_hash *ftrace_graph_hash;
+extern struct ftrace_hash *ftrace_graph_notrace_hash;
static inline int ftrace_graph_addr(unsigned long addr)
{
- int i;
-
- if (!ftrace_graph_count)
- return 1;
-
- for (i = 0; i < ftrace_graph_count; i++) {
- if (addr == ftrace_graph_funcs[i]) {
- /*
- * If no irqs are to be traced, but a set_graph_function
- * is set, and called by an interrupt handler, we still
- * want to trace it.
- */
- if (in_irq())
- trace_recursion_set(TRACE_IRQ_BIT);
- else
- trace_recursion_clear(TRACE_IRQ_BIT);
- return 1;
- }
+ int ret = 0;
+
+ preempt_disable_notrace();
+
+ if (ftrace_hash_empty(ftrace_graph_hash)) {
+ ret = 1;
+ goto out;
}
- return 0;
+ if (ftrace_lookup_ip(ftrace_graph_hash, addr)) {
+ /*
+ * If no irqs are to be traced, but a set_graph_function
+ * is set, and called by an interrupt handler, we still
+ * want to trace it.
+ */
+ if (in_irq())
+ trace_recursion_set(TRACE_IRQ_BIT);
+ else
+ trace_recursion_clear(TRACE_IRQ_BIT);
+ ret = 1;
+ }
+
+out:
+ preempt_enable_notrace();
+ return ret;
}
static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
- int i;
+ int ret = 0;
- if (!ftrace_graph_notrace_count)
- return 0;
+ preempt_disable_notrace();
- for (i = 0; i < ftrace_graph_notrace_count; i++) {
- if (addr == ftrace_graph_notrace_funcs[i])
- return 1;
- }
+ if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr))
+ ret = 1;
- return 0;
+ preempt_enable_notrace();
+ return ret;
}
#else
static inline int ftrace_graph_addr(unsigned long addr)
@@ -1300,7 +1312,8 @@ static inline bool is_string_field(struct ftrace_event_field *field)
{
return field->filter_type == FILTER_DYN_STRING ||
field->filter_type == FILTER_STATIC_STRING ||
- field->filter_type == FILTER_PTR_STRING;
+ field->filter_type == FILTER_PTR_STRING ||
+ field->filter_type == FILTER_COMM;
}
static inline bool is_function_field(struct ftrace_event_field *field)
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e3b488825ae3..e49fbe901cfc 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -175,9 +175,9 @@ int trace_benchmark_reg(void)
bm_event_thread = kthread_run(benchmark_event_kthread,
NULL, "event_benchmark");
- if (!bm_event_thread) {
+ if (IS_ERR(bm_event_thread)) {
pr_warning("trace benchmark failed to create kernel thread\n");
- return -ENOMEM;
+ return PTR_ERR(bm_event_thread);
}
return 0;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 75489de546b6..4d8fdf3184dc 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -27,7 +27,7 @@ static DEFINE_MUTEX(branch_tracing_mutex);
static struct trace_array *branch_tracer;
static void
-probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
struct trace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
@@ -68,16 +68,17 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry = ring_buffer_event_data(event);
/* Strip off the path, only save the file */
- p = f->file + strlen(f->file);
- while (p >= f->file && *p != '/')
+ p = f->data.file + strlen(f->data.file);
+ while (p >= f->data.file && *p != '/')
p--;
p++;
- strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+ strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE);
strncpy(entry->file, p, TRACE_FILE_SIZE);
entry->func[TRACE_FUNC_SIZE] = 0;
entry->file[TRACE_FILE_SIZE] = 0;
- entry->line = f->line;
+ entry->constant = f->constant;
+ entry->line = f->data.line;
entry->correct = val == expect;
if (!call_filter_check_discard(call, entry, buffer, event))
@@ -89,7 +90,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
}
static inline
-void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
if (!branch_tracing_enabled)
return;
@@ -195,13 +196,19 @@ core_initcall(init_branch_tracer);
#else
static inline
-void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
}
#endif /* CONFIG_BRANCH_TRACER */
-void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
+void ftrace_likely_update(struct ftrace_likely_data *f, int val,
+ int expect, int is_constant)
{
+ /* A constant is always correct */
+ if (is_constant) {
+ f->constant++;
+ val = expect;
+ }
/*
* I would love to have a trace point here instead, but the
* trace point code is so inundated with unlikely and likely
@@ -212,9 +219,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
/* FIXME: Make this atomic! */
if (val == expect)
- f->correct++;
+ f->data.correct++;
else
- f->incorrect++;
+ f->data.incorrect++;
}
EXPORT_SYMBOL(ftrace_likely_update);
@@ -245,29 +252,60 @@ static inline long get_incorrect_percent(struct ftrace_branch_data *p)
return percent;
}
-static int branch_stat_show(struct seq_file *m, void *v)
+static const char *branch_stat_process_file(struct ftrace_branch_data *p)
{
- struct ftrace_branch_data *p = v;
const char *f;
- long percent;
/* Only print the file, not the path */
f = p->file + strlen(p->file);
while (f >= p->file && *f != '/')
f--;
- f++;
+ return ++f;
+}
+
+static void branch_stat_show(struct seq_file *m,
+ struct ftrace_branch_data *p, const char *f)
+{
+ long percent;
/*
* The miss is overlayed on correct, and hit on incorrect.
*/
percent = get_incorrect_percent(p);
- seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
if (percent < 0)
seq_puts(m, " X ");
else
seq_printf(m, "%3ld ", percent);
+
seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+}
+
+static int branch_stat_show_normal(struct seq_file *m,
+ struct ftrace_branch_data *p, const char *f)
+{
+ seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
+ branch_stat_show(m, p, f);
+ return 0;
+}
+
+static int annotate_branch_stat_show(struct seq_file *m, void *v)
+{
+ struct ftrace_likely_data *p = v;
+ const char *f;
+ int l;
+
+ f = branch_stat_process_file(&p->data);
+
+ if (!p->constant)
+ return branch_stat_show_normal(m, &p->data, f);
+
+ l = snprintf(NULL, 0, "/%lu", p->constant);
+ l = l > 8 ? 0 : 8 - l;
+
+ seq_printf(m, "%8lu/%lu %*lu ",
+ p->data.correct, p->constant, l, p->data.incorrect);
+ branch_stat_show(m, &p->data, f);
return 0;
}
@@ -279,7 +317,7 @@ static void *annotated_branch_stat_start(struct tracer_stat *trace)
static void *
annotated_branch_stat_next(void *v, int idx)
{
- struct ftrace_branch_data *p = v;
+ struct ftrace_likely_data *p = v;
++p;
@@ -328,7 +366,7 @@ static struct tracer_stat annotated_branch_stats = {
.stat_next = annotated_branch_stat_next,
.stat_cmp = annotated_branch_stat_cmp,
.stat_headers = annotated_branch_stat_headers,
- .stat_show = branch_stat_show
+ .stat_show = annotate_branch_stat_show
};
__init static int init_annotated_branch_stats(void)
@@ -379,12 +417,21 @@ all_branch_stat_next(void *v, int idx)
return p;
}
+static int all_branch_stat_show(struct seq_file *m, void *v)
+{
+ struct ftrace_branch_data *p = v;
+ const char *f;
+
+ f = branch_stat_process_file(p);
+ return branch_stat_show_normal(m, p, f);
+}
+
static struct tracer_stat all_branch_stats = {
.name = "branch_all",
.stat_start = all_branch_stat_start,
.stat_next = all_branch_stat_next,
.stat_headers = all_branch_stat_headers,
- .stat_show = branch_stat_show
+ .stat_show = all_branch_stat_show
};
__init static int all_annotated_branch_stats(void)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index eb7396b7e7c3..c203ac4df791 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -328,11 +328,13 @@ FTRACE_ENTRY(branch, trace_branch,
__array( char, func, TRACE_FUNC_SIZE+1 )
__array( char, file, TRACE_FILE_SIZE+1 )
__field( char, correct )
+ __field( char, constant )
),
- F_printk("%u:%s:%s (%u)",
+ F_printk("%u:%s:%s (%u)%s",
__entry->line,
- __entry->func, __entry->file, __entry->correct),
+ __entry->func, __entry->file, __entry->correct,
+ __entry->constant ? " CONSTANT" : ""),
FILTER_OTHER
);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index af344a1bf0d0..edfacd954e1b 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -266,24 +266,13 @@ out:
static struct cpumask save_cpumask;
static bool disable_migrate;
-static void move_to_next_cpu(bool initmask)
+static void move_to_next_cpu(void)
{
- static struct cpumask *current_mask;
+ struct cpumask *current_mask = &save_cpumask;
int next_cpu;
if (disable_migrate)
return;
-
- /* Just pick the first CPU on first iteration */
- if (initmask) {
- current_mask = &save_cpumask;
- get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
- put_online_cpus();
- next_cpu = cpumask_first(current_mask);
- goto set_affinity;
- }
-
/*
* If for some reason the user modifies the CPU affinity
* of this thread, than stop migrating for the duration
@@ -300,7 +289,6 @@ static void move_to_next_cpu(bool initmask)
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(current_mask);
- set_affinity:
if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
goto disable;
@@ -322,20 +310,15 @@ static void move_to_next_cpu(bool initmask)
* need to ensure nothing else might be running (and thus preempting).
* Obviously this should never be used in production environments.
*
- * Currently this runs on which ever CPU it was scheduled on, but most
- * real-world hardware latency situations occur across several CPUs,
- * but we might later generalize this if we find there are any actualy
- * systems with alternate SMI delivery or other hardware latencies.
+ * Executes one loop interaction on each CPU in tracing_cpumask sysfs file.
*/
static int kthread_fn(void *data)
{
u64 interval;
- bool initmask = true;
while (!kthread_should_stop()) {
- move_to_next_cpu(initmask);
- initmask = false;
+ move_to_next_cpu();
local_irq_disable();
get_sample();
@@ -366,13 +349,27 @@ static int kthread_fn(void *data)
*/
static int start_kthread(struct trace_array *tr)
{
+ struct cpumask *current_mask = &save_cpumask;
struct task_struct *kthread;
+ int next_cpu;
+
+ /* Just pick the first CPU on first iteration */
+ current_mask = &save_cpumask;
+ get_online_cpus();
+ cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ put_online_cpus();
+ next_cpu = cpumask_first(current_mask);
kthread = kthread_create(kthread_fn, NULL, "hwlatd");
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
return -ENOMEM;
}
+
+ cpumask_clear(current_mask);
+ cpumask_set_cpu(next_cpu, current_mask);
+ sched_setaffinity(kthread->pid, current_mask);
+
hwlat_kthread = kthread;
wake_up_process(kthread);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7ad9e53ad174..eadd96ef772f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -16,6 +16,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) "trace_kprobe: " fmt
#include <linux/module.h>
#include <linux/uaccess.h>
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8c0553d9afd3..52478f033f88 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -21,6 +21,7 @@
* Copyright (C) IBM Corporation, 2010-2011
* Author: Srikar Dronamraju
*/
+#define pr_fmt(fmt) "trace_probe: " fmt
#include "trace_probe.h"
@@ -647,7 +648,7 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos,
int (*createfn)(int, char **))
{
- char *kbuf, *tmp;
+ char *kbuf, *buf, *tmp;
int ret = 0;
size_t done = 0;
size_t size;
@@ -667,27 +668,38 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
goto out;
}
kbuf[size] = '\0';
- tmp = strchr(kbuf, '\n');
+ buf = kbuf;
+ do {
+ tmp = strchr(buf, '\n');
+ if (tmp) {
+ *tmp = '\0';
+ size = tmp - buf + 1;
+ } else {
+ size = strlen(buf);
+ if (done + size < count) {
+ if (buf != kbuf)
+ break;
+ /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
+ pr_warn("Line length is too long: Should be less than %d\n",
+ WRITE_BUFSIZE - 2);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ done += size;
- if (tmp) {
- *tmp = '\0';
- size = tmp - kbuf + 1;
- } else if (done + size < count) {
- pr_warn("Line length is too long: Should be less than %d\n",
- WRITE_BUFSIZE);
- ret = -EINVAL;
- goto out;
- }
- done += size;
- /* Remove comments */
- tmp = strchr(kbuf, '#');
+ /* Remove comments */
+ tmp = strchr(buf, '#');
- if (tmp)
- *tmp = '\0';
+ if (tmp)
+ *tmp = '\0';
- ret = traceprobe_command(kbuf, createfn);
- if (ret)
- goto out;
+ ret = traceprobe_command(buf, createfn);
+ if (ret)
+ goto out;
+ buf += size;
+
+ } while (done < count);
}
ret = done;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 0913693caf6e..f4379e772171 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -17,6 +17,7 @@
* Copyright (C) IBM Corporation, 2010-2012
* Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
*/
+#define pr_fmt(fmt) "trace_kprobe: " fmt
#include <linux/module.h>
#include <linux/uaccess.h>
@@ -431,7 +432,8 @@ static int create_trace_uprobe(int argc, char **argv)
pr_info("Probe point is not specified.\n");
return -EINVAL;
}
- arg = strchr(argv[1], ':');
+ /* Find the last occurrence, in case the path contains ':' too. */
+ arg = strrchr(argv[1], ':');
if (!arg) {
ret = -EINVAL;
goto fail_address_parse;
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index c8cebb137076..9c21000df0b5 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -176,13 +176,12 @@ static int percpu_counter_cpu_dead(unsigned int cpu)
spin_lock_irq(&percpu_counters_lock);
list_for_each_entry(fbc, &percpu_counters, list) {
s32 *pcount;
- unsigned long flags;
- raw_spin_lock_irqsave(&fbc->lock, flags);
+ raw_spin_lock(&fbc->lock);
pcount = per_cpu_ptr(fbc->counters, cpu);
fbc->count += *pcount;
*pcount = 0;
- raw_spin_unlock_irqrestore(&fbc->lock, flags);
+ raw_spin_unlock(&fbc->lock);
}
spin_unlock_irq(&percpu_counters_lock);
#endif
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 8fdee24725a7..eafbf11442b2 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -12,8 +12,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
{
FILE *fp;
char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
+ char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path;
char *token, *saved_ptr = NULL;
- int found = 0;
fp = fopen("/proc/mounts", "r");
if (!fp)
@@ -24,31 +24,43 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
* and inspect every cgroupfs mount point to find one that has
* perf_event subsystem
*/
+ path_v1[0] = '\0';
+ path_v2[0] = '\0';
+
while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %"
STR(PATH_MAX)"s %*d %*d\n",
mountpoint, type, tokens) == 3) {
- if (!strcmp(type, "cgroup")) {
+ if (!path_v1[0] && !strcmp(type, "cgroup")) {
token = strtok_r(tokens, ",", &saved_ptr);
while (token != NULL) {
if (!strcmp(token, "perf_event")) {
- found = 1;
+ strcpy(path_v1, mountpoint);
break;
}
token = strtok_r(NULL, ",", &saved_ptr);
}
}
- if (found)
+
+ if (!path_v2[0] && !strcmp(type, "cgroup2"))
+ strcpy(path_v2, mountpoint);
+
+ if (path_v1[0] && path_v2[0])
break;
}
fclose(fp);
- if (!found)
+
+ if (path_v1[0])
+ path = path_v1;
+ else if (path_v2[0])
+ path = path_v2;
+ else
return -1;
- if (strlen(mountpoint) < maxlen) {
- strcpy(buf, mountpoint);
+ if (strlen(path) < maxlen) {
+ strcpy(buf, path);
return 0;
}
return -1;
diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index be93ab02b490..6e4eb2fc2d1e 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -179,6 +179,7 @@ my $localversion;
my $iteration = 0;
my $successes = 0;
my $stty_orig;
+my $run_command_status = 0;
my $bisect_good;
my $bisect_bad;
@@ -1325,26 +1326,44 @@ sub wait_for_monitor;
sub reboot {
my ($time) = @_;
+ my $powercycle = 0;
- # Make sure everything has been written to disk
- run_ssh("sync");
+ # test if the machine can be connected to within 5 seconds
+ my $stat = run_ssh("echo check machine status", 5);
+ if (!$stat) {
+ doprint("power cycle\n");
+ $powercycle = 1;
+ }
+
+ if ($powercycle) {
+ run_command "$power_cycle";
- if (defined($time)) {
start_monitor;
# flush out current monitor
# May contain the reboot success line
wait_for_monitor 1;
- }
- # try to reboot normally
- if (run_command $reboot) {
- if (defined($powercycle_after_reboot)) {
- sleep $powercycle_after_reboot;
+ } else {
+ # Make sure everything has been written to disk
+ run_ssh("sync");
+
+ if (defined($time)) {
+ start_monitor;
+ # flush out current monitor
+ # May contain the reboot success line
+ wait_for_monitor 1;
+ }
+
+ # try to reboot normally
+ if (run_command $reboot) {
+ if (defined($powercycle_after_reboot)) {
+ sleep $powercycle_after_reboot;
+ run_command "$power_cycle";
+ }
+ } else {
+ # nope? power cycle it.
run_command "$power_cycle";
}
- } else {
- # nope? power cycle it.
- run_command "$power_cycle";
}
if (defined($time)) {
@@ -1412,6 +1431,10 @@ sub dodie {
system("stty $stty_orig");
}
+ if (defined($post_test)) {
+ run_command $post_test;
+ }
+
die @_, "\n";
}
@@ -1624,10 +1647,6 @@ sub save_logs {
sub fail {
- if (defined($post_test)) {
- run_command $post_test;
- }
-
if ($die_on_failure) {
dodie @_;
}
@@ -1660,23 +1679,26 @@ sub fail {
save_logs "fail", $store_failures;
}
+ if (defined($post_test)) {
+ run_command $post_test;
+ }
+
return 1;
}
sub run_command {
- my ($command, $redirect) = @_;
+ my ($command, $redirect, $timeout) = @_;
my $start_time;
my $end_time;
my $dolog = 0;
my $dord = 0;
my $pid;
- $start_time = time;
-
$command =~ s/\$SSH_USER/$ssh_user/g;
$command =~ s/\$MACHINE/$machine/g;
doprint("$command ... ");
+ $start_time = time;
$pid = open(CMD, "$command 2>&1 |") or
(fail "unable to exec $command" and return 0);
@@ -1693,13 +1715,30 @@ sub run_command {
$dord = 1;
}
- while (<CMD>) {
- print LOG if ($dolog);
- print RD if ($dord);
+ my $hit_timeout = 0;
+
+ while (1) {
+ my $fp = \*CMD;
+ if (defined($timeout)) {
+ doprint "timeout = $timeout\n";
+ }
+ my $line = wait_for_input($fp, $timeout);
+ if (!defined($line)) {
+ my $now = time;
+ if (defined($timeout) && (($now - $start_time) >= $timeout)) {
+ doprint "Hit timeout of $timeout, killing process\n";
+ $hit_timeout = 1;
+ kill 9, $pid;
+ }
+ last;
+ }
+ print LOG $line if ($dolog);
+ print RD $line if ($dord);
}
waitpid($pid, 0);
- my $failed = $?;
+ # shift 8 for real exit status
+ $run_command_status = $? >> 8;
close(CMD);
close(LOG) if ($dolog);
@@ -1714,21 +1753,25 @@ sub run_command {
doprint "[$delta seconds] ";
}
- if ($failed) {
+ if ($hit_timeout) {
+ $run_command_status = 1;
+ }
+
+ if ($run_command_status) {
doprint "FAILED!\n";
} else {
doprint "SUCCESS\n";
}
- return !$failed;
+ return !$run_command_status;
}
sub run_ssh {
- my ($cmd) = @_;
+ my ($cmd, $timeout) = @_;
my $cp_exec = $ssh_exec;
$cp_exec =~ s/\$SSH_COMMAND/$cmd/g;
- return run_command "$cp_exec";
+ return run_command "$cp_exec", undef , $timeout;
}
sub run_scp {
@@ -2489,10 +2532,6 @@ sub halt {
sub success {
my ($i) = @_;
- if (defined($post_test)) {
- run_command $post_test;
- }
-
$successes++;
my $name = "";
@@ -2517,6 +2556,10 @@ sub success {
doprint "Reboot and wait $sleep_time seconds\n";
reboot_to_good $sleep_time;
}
+
+ if (defined($post_test)) {
+ run_command $post_test;
+ }
}
sub answer_bisect {
@@ -2537,16 +2580,15 @@ sub answer_bisect {
}
sub child_run_test {
- my $failed = 0;
# child should have no power
$reboot_on_error = 0;
$poweroff_on_error = 0;
$die_on_failure = 1;
- run_command $run_test, $testlog or $failed = 1;
+ run_command $run_test, $testlog;
- exit $failed;
+ exit $run_command_status;
}
my $child_done;
@@ -2629,7 +2671,7 @@ sub do_run_test {
}
waitpid $child_pid, 0;
- $child_exit = $?;
+ $child_exit = $? >> 8;
my $end_time = time;
$test_time = $end_time - $start_time;
@@ -3330,7 +3372,6 @@ sub config_bisect {
save_config \%good_configs, $good_config;
save_config \%bad_configs, $bad_config;
-
if (defined($config_bisect_check) && $config_bisect_check ne "0") {
if ($config_bisect_check ne "good") {
doprint "Testing bad config\n";