diff options
515 files changed, 33282 insertions, 7565 deletions
diff --git a/Documentation/devicetree/bindings/input/rotary-encoder.txt b/Documentation/devicetree/bindings/input/rotary-encoder.txt index 6c9f0c8a846c..e85ce3dea480 100644 --- a/Documentation/devicetree/bindings/input/rotary-encoder.txt +++ b/Documentation/devicetree/bindings/input/rotary-encoder.txt @@ -20,6 +20,8 @@ Optional properties: 2: Half-period mode 4: Quarter-period mode - wakeup-source: Boolean, rotary encoder can wake up the system. +- rotary-encoder,encoding: String, the method used to encode steps. + Supported are "gray" (the default and more common) and "binary". Deprecated properties: - rotary-encoder,half-period: Makes the driver work on half-period mode. @@ -34,6 +36,7 @@ Example: compatible = "rotary-encoder"; gpios = <&gpio 19 1>, <&gpio 20 0>; /* GPIO19 is inverted */ linux,axis = <0>; /* REL_X */ + rotary-encoder,encoding = "gray"; rotary-encoder,relative-axis; }; @@ -42,5 +45,6 @@ Example: gpios = <&gpio 21 0>, <&gpio 22 0>; linux,axis = <1>; /* ABS_Y */ rotary-encoder,steps = <24>; + rotary-encoder,encoding = "binary"; rotary-encoder,rollover; }; diff --git a/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt b/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt new file mode 100644 index 000000000000..1112e0d794e1 --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt @@ -0,0 +1,36 @@ +* GSL 1680 touchscreen controller + +Required properties: +- compatible : "silead,gsl1680" +- reg : I2C slave address of the chip (0x40) +- interrupt-parent : a phandle pointing to the interrupt controller + serving the interrupt for this chip +- interrupts : interrupt specification for the gsl1680 interrupt +- power-gpios : Specification for the pin connected to the gsl1680's + shutdown input. 
This needs to be driven high to take the + gsl1680 out of its low power state +- touchscreen-size-x : See touchscreen.txt +- touchscreen-size-y : See touchscreen.txt + +Optional properties: +- touchscreen-inverted-x : See touchscreen.txt +- touchscreen-inverted-y : See touchscreen.txt +- touchscreen-swapped-x-y : See touchscreen.txt +- silead,max-fingers : maximum number of fingers the touchscreen can detect + +Example: + +i2c@00000000 { + gsl1680: touchscreen@40 { + compatible = "silead,gsl1680"; + reg = <0x40>; + interrupt-parent = <&pio>; + interrupts = <6 11 IRQ_TYPE_EDGE_FALLING>; + power-gpios = <&pio 1 3 GPIO_ACTIVE_HIGH>; + touchscreen-size-x = <480>; + touchscreen-size-y = <800>; + touchscreen-inverted-x; + touchscreen-swapped-x-y; + silead,max-fingers = <5>; + }; +}; diff --git a/Documentation/devicetree/bindings/input/touchscreen/sis_i2c.txt b/Documentation/devicetree/bindings/input/touchscreen/sis_i2c.txt new file mode 100644 index 000000000000..d87ad14f1efe --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/sis_i2c.txt @@ -0,0 +1,33 @@ +* SiS I2C Multiple Touch Controller + +Required properties: +- compatible: must be "sis,9200-ts" +- reg: i2c slave address +- interrupt-parent: the phandle for the interrupt controller + (see interrupt binding [0]) +- interrupts: touch controller interrupt (see interrupt + binding [0]) + +Optional properties: +- pinctrl-names: should be "default" (see pinctrl binding [1]). +- pinctrl-0: a phandle pointing to the pin settings for the + device (see pinctrl binding [1]). 
+- attn-gpios: the gpio pin used as attention line +- reset-gpios: the gpio pin used to reset the controller +- wakeup-source: touchscreen can be used as a wakeup source + +[0]: Documentation/devicetree/bindings/interrupt-controller/interrupts.txt +[1]: Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt + +Example: + + sis9255@5c { + compatible = "sis,9200-ts"; + reg = <0x5c>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_sis>; + interrupt-parent = <&gpio3>; + interrupts = <19 IRQ_TYPE_EDGE_FALLING>; + attn-gpios = <&gpio3 19 GPIO_ACTIVE_LOW>; + reset-gpios = <&gpio2 30 GPIO_ACTIVE_LOW>; + }; diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt index 68391a462c0a..1992aa97d45a 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.txt +++ b/Documentation/devicetree/bindings/vendor-prefixes.txt @@ -238,6 +238,7 @@ simtek sii Seiko Instruments, Inc. silergy Silergy Corp. sirf SiRF Technology, Inc. +sis Silicon Integrated Systems Corp. sitronix Sitronix Technology Corporation skyworks Skyworks Solutions, Inc. smsc Standard Microsystems Corporation diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1a855d0c11fa..eb0a0582d912 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3877,6 +3877,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. using these two parameters to set the minimum and maximum port values. + sunrpc.svc_rpc_per_connection_limit= + [NFS,SUNRPC] + Limit the number of requests that the server will + process in parallel from a single connection. + The default value is 0 (no limit). 
+ sunrpc.pool_mode= [NFS] Control how the NFS server code allocates CPUs to diff --git a/MAINTAINERS b/MAINTAINERS index bafc8043d4f0..e9c75275405d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3219,7 +3219,7 @@ M: Johannes Weiner <hannes@cmpxchg.org> L: cgroups@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git S: Maintained -F: Documentation/cgroups/ +F: Documentation/cgroup* F: include/linux/cgroup* F: kernel/cgroup* @@ -3230,7 +3230,7 @@ W: http://www.bullopensource.org/cpuset/ W: http://oss.sgi.com/projects/cpusets/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git S: Maintained -F: Documentation/cgroups/cpusets.txt +F: Documentation/cgroup-v1/cpusets.txt F: include/linux/cpuset.h F: kernel/cpuset.c @@ -5831,7 +5831,15 @@ M: Tyrel Datwyler <tyreld@linux.vnet.ibm.com> L: linux-scsi@vger.kernel.org S: Supported F: drivers/scsi/ibmvscsi/ibmvscsi* -F: drivers/scsi/ibmvscsi/viosrp.h +F: include/scsi/viosrp.h + +IBM Power Virtual SCSI Device Target Driver +M: Bryant G. Ly <bryantly@linux.vnet.ibm.com> +M: Michael Cyr <mikecyr@linux.vnet.ibm.com> +L: linux-scsi@vger.kernel.org +L: target-devel@vger.kernel.org +S: Supported +F: drivers/scsi/ibmvscsi_tgt/ IBM Power Virtual FC Device Drivers M: Tyrel Datwyler <tyreld@linux.vnet.ibm.com> @@ -7639,6 +7647,15 @@ W: http://www.mellanox.com Q: http://patchwork.ozlabs.org/project/netdev/list/ F: drivers/net/ethernet/mellanox/mlxsw/ +SOFT-ROCE DRIVER (rxe) +M: Moni Shoua <monis@mellanox.com> +L: linux-rdma@vger.kernel.org +S: Supported +W: https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home +Q: http://patchwork.kernel.org/project/linux-rdma/list/ +F: drivers/infiniband/hw/rxe/ +F: include/uapi/rdma/rdma_user_rxe.h + MEMBARRIER SUPPORT M: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> M: "Paul E. 
McKenney" <paulmck@linux.vnet.ibm.com> @@ -9811,10 +9828,14 @@ L: rtc-linux@googlegroups.com Q: http://patchwork.ozlabs.org/project/rtc-linux/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git S: Maintained +F: Documentation/devicetree/bindings/rtc/ F: Documentation/rtc.txt F: drivers/rtc/ F: include/linux/rtc.h F: include/uapi/linux/rtc.h +F: include/linux/rtc/ +F: include/linux/platform_data/rtc-* +F: tools/testing/selftests/timers/rtctest.c REALTEK AUDIO CODECS M: Bard Liao <bardliao@realtek.com> diff --git a/arch/alpha/include/asm/rtc.h b/arch/alpha/include/asm/rtc.h deleted file mode 100644 index f71c3b0ed360..000000000000 --- a/arch/alpha/include/asm/rtc.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/rtc.h> diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index 53dd2f1a53aa..d5f0580746a5 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -24,7 +24,6 @@ #include <asm/gct.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> -#include <asm/rtc.h> #include <asm/vga.h> #include "proto.h" diff --git a/arch/alpha/kernel/rtc.c b/arch/alpha/kernel/rtc.c index f535a3fd0f60..ceed68c7500b 100644 --- a/arch/alpha/kernel/rtc.c +++ b/arch/alpha/kernel/rtc.c @@ -15,8 +15,6 @@ #include <linux/rtc.h> #include <linux/platform_device.h> -#include <asm/rtc.h> - #include "proto.h" @@ -81,7 +79,7 @@ init_rtc_epoch(void) static int alpha_rtc_read_time(struct device *dev, struct rtc_time *tm) { - __get_rtc_time(tm); + mc146818_get_time(tm); /* Adjust for non-default epochs. 
It's easier to depend on the generic __get_rtc_time and adjust the epoch here than create @@ -112,7 +110,7 @@ alpha_rtc_set_time(struct device *dev, struct rtc_time *tm) tm = &xtm; } - return __set_rtc_time(tm); + return mc146818_set_time(tm); } static int diff --git a/arch/arm/mach-ep93xx/ts72xx.c b/arch/arm/mach-ep93xx/ts72xx.c index 45b81a2bcd4b..3b39ea353d30 100644 --- a/arch/arm/mach-ep93xx/ts72xx.c +++ b/arch/arm/mach-ep93xx/ts72xx.c @@ -16,7 +16,7 @@ #include <linux/init.h> #include <linux/platform_device.h> #include <linux/io.h> -#include <linux/m48t86.h> +#include <linux/platform_data/rtc-m48t86.h> #include <linux/mtd/nand.h> #include <linux/mtd/partitions.h> diff --git a/arch/arm/mach-orion5x/ts78xx-setup.c b/arch/arm/mach-orion5x/ts78xx-setup.c index 3a58a5d4a28a..8d597267d0c4 100644 --- a/arch/arm/mach-orion5x/ts78xx-setup.c +++ b/arch/arm/mach-orion5x/ts78xx-setup.c @@ -16,7 +16,7 @@ #include <linux/platform_device.h> #include <linux/mv643xx_eth.h> #include <linux/ata_platform.h> -#include <linux/m48t86.h> +#include <linux/platform_data/rtc-m48t86.h> #include <linux/mtd/nand.h> #include <linux/mtd/partitions.h> #include <linux/timeriomem-rng.h> diff --git a/arch/arm/mach-pxa/cm-x270.c b/arch/arm/mach-pxa/cm-x270.c index fa5f51d633a3..be4a66166d61 100644 --- a/arch/arm/mach-pxa/cm-x270.c +++ b/arch/arm/mach-pxa/cm-x270.c @@ -14,7 +14,7 @@ #include <linux/gpio.h> #include <linux/delay.h> -#include <linux/rtc-v3020.h> +#include <linux/platform_data/rtc-v3020.h> #include <video/mbxfb.h> #include <linux/spi/spi.h> diff --git a/arch/arm/mach-pxa/cm-x300.c b/arch/arm/mach-pxa/cm-x300.c index 5f5ac7c8faf0..868448d2cd82 100644 --- a/arch/arm/mach-pxa/cm-x300.c +++ b/arch/arm/mach-pxa/cm-x300.c @@ -25,7 +25,7 @@ #include <linux/gpio.h> #include <linux/dm9000.h> #include <linux/leds.h> -#include <linux/rtc-v3020.h> +#include <linux/platform_data/rtc-v3020.h> #include <linux/pwm.h> #include <linux/pwm_backlight.h> diff --git a/arch/arm/mach-pxa/em-x270.c 
b/arch/arm/mach-pxa/em-x270.c index 6e0268deec43..03354c21e1f2 100644 --- a/arch/arm/mach-pxa/em-x270.c +++ b/arch/arm/mach-pxa/em-x270.c @@ -14,7 +14,7 @@ #include <linux/delay.h> #include <linux/dm9000.h> -#include <linux/rtc-v3020.h> +#include <linux/platform_data/rtc-v3020.h> #include <linux/mtd/nand.h> #include <linux/mtd/partitions.h> #include <linux/mtd/physmap.h> diff --git a/arch/frv/include/asm/mc146818rtc.h b/arch/frv/include/asm/mc146818rtc.h deleted file mode 100644 index 90dfb7a633d1..000000000000 --- a/arch/frv/include/asm/mc146818rtc.h +++ /dev/null @@ -1,16 +0,0 @@ -/* mc146818rtc.h: RTC defs - * - * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _ASM_MC146818RTC_H -#define _ASM_MC146818RTC_H - - -#endif /* _ASM_MC146818RTC_H */ diff --git a/arch/h8300/include/asm/mc146818rtc.h b/arch/h8300/include/asm/mc146818rtc.h deleted file mode 100644 index ab9d9646d241..000000000000 --- a/arch/h8300/include/asm/mc146818rtc.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Machine dependent access functions for RTC registers. - */ -#ifndef _H8300_MC146818RTC_H -#define _H8300_MC146818RTC_H - -/* empty include file to satisfy the include in genrtc.c/ide-geometry.c */ - -#endif /* _H8300_MC146818RTC_H */ diff --git a/arch/ia64/include/asm/mc146818rtc.h b/arch/ia64/include/asm/mc146818rtc.h deleted file mode 100644 index 407787a237ba..000000000000 --- a/arch/ia64/include/asm/mc146818rtc.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _ASM_IA64_MC146818RTC_H -#define _ASM_IA64_MC146818RTC_H - -/* - * Machine dependent access functions for RTC registers. 
- */ - -/* empty include file to satisfy the include in genrtc.c */ - -#endif /* _ASM_IA64_MC146818RTC_H */ diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index 01693df7f2f6..ec9cc1fdd237 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -35,7 +35,6 @@ #include <asm/amigahw.h> #include <asm/amigaints.h> #include <asm/irq.h> -#include <asm/rtc.h> #include <asm/machdep.h> #include <asm/io.h> diff --git a/arch/m68k/apollo/config.c b/arch/m68k/apollo/config.c index 6e62d66c396e..432bc8bacfc2 100644 --- a/arch/m68k/apollo/config.c +++ b/arch/m68k/apollo/config.c @@ -15,7 +15,6 @@ #include <asm/pgtable.h> #include <asm/apollohw.h> #include <asm/irq.h> -#include <asm/rtc.h> #include <asm/machdep.h> u_long sio01_physaddr; diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c index 478623dbb209..611d4d9ea2bd 100644 --- a/arch/m68k/bvme6000/config.c +++ b/arch/m68k/bvme6000/config.c @@ -34,7 +34,6 @@ #include <asm/setup.h> #include <asm/irq.h> #include <asm/traps.h> -#include <asm/rtc.h> #include <asm/machdep.h> #include <asm/bvme6000hw.h> diff --git a/arch/m68k/hp300/config.c b/arch/m68k/hp300/config.c index a9befe65adc4..7cfab158fb61 100644 --- a/arch/m68k/hp300/config.c +++ b/arch/m68k/hp300/config.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/console.h> +#include <linux/rtc.h> #include <asm/bootinfo.h> #include <asm/bootinfo-hp300.h> @@ -20,7 +21,6 @@ #include <asm/blinken.h> #include <asm/io.h> /* readb() and writeb() */ #include <asm/hp300hw.h> -#include <asm/rtc.h> #include "time.h" diff --git a/arch/m68k/include/asm/flat.h b/arch/m68k/include/asm/flat.h index f9454b89a51b..00c392b0cabd 100644 --- a/arch/m68k/include/asm/flat.h +++ b/arch/m68k/include/asm/flat.h @@ -1,5 +1,5 @@ /* - * include/asm-m68knommu/flat.h -- uClinux flat-format executables + * flat.h -- uClinux flat-format executables */ #ifndef __M68KNOMMU_FLAT_H__ @@ -8,8 +8,9 @@ #define 
flat_argvp_envp_on_stack() 1 #define flat_old_ram_flag(flags) (flags) #define flat_reloc_valid(reloc, size) ((reloc) <= (size)) -#define flat_get_addr_from_rp(rp, relval, flags, p) get_unaligned(rp) -#define flat_put_addr_at_rp(rp, val, relval) put_unaligned(val,rp) +#define flat_get_addr_from_rp(rp, relval, flags, p) \ + ({ unsigned long __val; __get_user_unaligned(__val, rp); __val; }) +#define flat_put_addr_at_rp(rp, val, relval) __put_user_unaligned(val, rp) #define flat_get_relocate_addr(rel) (rel) static inline int flat_set_persistent(unsigned long relval, @@ -18,4 +19,10 @@ static inline int flat_set_persistent(unsigned long relval, return 0; } +#define FLAT_PLAT_INIT(regs) \ + do { \ + if (current->mm) \ + (regs)->d5 = current->mm->start_data; \ + } while (0) + #endif /* __M68KNOMMU_FLAT_H__ */ diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index a6ce2ec8d693..c84a2183b3f0 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -110,7 +110,6 @@ struct thread_struct { #define setframeformat(_regs) do { } while (0) #endif -#ifdef CONFIG_MMU /* * Do necessary setup to start up a newly executed thread. 
*/ @@ -123,26 +122,14 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, wrusp(usp); } +#ifdef CONFIG_MMU extern int handle_kernel_fault(struct pt_regs *regs); - #else - -#define start_thread(_regs, _pc, _usp) \ -do { \ - (_regs)->pc = (_pc); \ - setframeformat(_regs); \ - if (current->mm) \ - (_regs)->d5 = current->mm->start_data; \ - (_regs)->sr &= ~0x2000; \ - wrusp(_usp); \ -} while(0) - static inline int handle_kernel_fault(struct pt_regs *regs) { /* Any fault in kernel is fatal on non-mmu */ return 0; } - #endif /* Forward declaration, a strange C thing */ diff --git a/arch/m68k/include/asm/rtc.h b/arch/m68k/include/asm/rtc.h deleted file mode 100644 index a4d08ea122ee..000000000000 --- a/arch/m68k/include/asm/rtc.h +++ /dev/null @@ -1,79 +0,0 @@ -/* include/asm-m68k/rtc.h - * - * Copyright Richard Zidlicky - * implementation details for genrtc/q40rtc driver - */ -/* permission is hereby granted to copy, modify and redistribute this code - * in terms of the GNU Library General Public License, Version 2 or later, - * at your option. - */ - -#ifndef _ASM_RTC_H -#define _ASM_RTC_H - -#ifdef __KERNEL__ - -#include <linux/rtc.h> -#include <asm/errno.h> -#include <asm/machdep.h> - -#define RTC_PIE 0x40 /* periodic interrupt enable */ -#define RTC_AIE 0x20 /* alarm interrupt enable */ -#define RTC_UIE 0x10 /* update-finished interrupt enable */ - -/* some dummy definitions */ -#define RTC_BATT_BAD 0x100 /* battery bad */ -#define RTC_SQWE 0x08 /* enable square-wave output */ -#define RTC_DM_BINARY 0x04 /* all time/date values are BCD if clear */ -#define RTC_24H 0x02 /* 24 hour mode - else hours bit 7 means pm */ -#define RTC_DST_EN 0x01 /* auto switch DST - works f. USA only */ - -static inline unsigned int get_rtc_time(struct rtc_time *time) -{ - /* - * Only the values that we read from the RTC are set. We leave - * tm_wday, tm_yday and tm_isdst untouched. 
Even though the - * RTC has RTC_DAY_OF_WEEK, we ignore it, as it is only updated - * by the RTC when initially set to a non-zero value. - */ - if (mach_hwclk) - mach_hwclk(0, time); - return RTC_24H; -} - -static inline int set_rtc_time(struct rtc_time *time) -{ - if (mach_hwclk) - return mach_hwclk(1, time); - return -EINVAL; -} - -static inline unsigned int get_rtc_ss(void) -{ - if (mach_get_ss) - return mach_get_ss(); - else{ - struct rtc_time h; - - get_rtc_time(&h); - return h.tm_sec; - } -} - -static inline int get_rtc_pll(struct rtc_pll_info *pll) -{ - if (mach_get_rtc_pll) - return mach_get_rtc_pll(pll); - else - return -EINVAL; -} -static inline int set_rtc_pll(struct rtc_pll_info *pll) -{ - if (mach_set_rtc_pll) - return mach_set_rtc_pll(pll); - else - return -EINVAL; -} -#endif /* __KERNEL__ */ - -#endif /* _ASM__RTC_H */ diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c index 3857737e3958..4e5aa2f4f522 100644 --- a/arch/m68k/kernel/time.c +++ b/arch/m68k/kernel/time.c @@ -86,7 +86,49 @@ void read_persistent_clock(struct timespec *ts) } } -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +#if defined(CONFIG_ARCH_USES_GETTIMEOFFSET) && IS_ENABLED(CONFIG_RTC_DRV_GENERIC) +static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm) +{ + mach_hwclk(0, tm); + return rtc_valid_tm(tm); +} + +static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm) +{ + if (mach_hwclk(1, tm) < 0) + return -EOPNOTSUPP; + return 0; +} + +static int rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) +{ + struct rtc_pll_info pll; + struct rtc_pll_info __user *argp = (void __user *)arg; + + switch (cmd) { + case RTC_PLL_GET: + if (!mach_get_rtc_pll || mach_get_rtc_pll(&pll)) + return -EINVAL; + return copy_to_user(argp, &pll, sizeof pll) ? 
-EFAULT : 0; + + case RTC_PLL_SET: + if (!mach_set_rtc_pll) + return -EINVAL; + if (!capable(CAP_SYS_TIME)) + return -EACCES; + if (copy_from_user(&pll, argp, sizeof(pll))) + return -EFAULT; + return mach_set_rtc_pll(&pll); + } + + return -ENOIOCTLCMD; +} + +static const struct rtc_class_ops generic_rtc_ops = { + .ioctl = rtc_ioctl, + .read_time = rtc_generic_get_time, + .set_time = rtc_generic_set_time, +}; static int __init rtc_init(void) { @@ -95,7 +137,9 @@ static int __init rtc_init(void) if (!mach_hwclk) return -ENODEV; - pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0); + pdev = platform_device_register_data(NULL, "rtc-generic", -1, + &generic_rtc_ops, + sizeof(generic_rtc_ops)); return PTR_ERR_OR_ZERO(pdev); } diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index 689b47d292ac..2f33a33001e5 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -10,6 +10,7 @@ * Miscellaneous linux stuff */ +#include <linux/errno.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> @@ -25,6 +26,7 @@ #include <linux/platform_device.h> #include <linux/adb.h> #include <linux/cuda.h> +#include <linux/rtc.h> #include <asm/setup.h> #include <asm/bootinfo.h> @@ -34,7 +36,6 @@ #include <asm/io.h> #include <asm/irq.h> #include <asm/pgtable.h> -#include <asm/rtc.h> #include <asm/machdep.h> #include <asm/macintosh.h> diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 707b61aea203..0fb54a90eac2 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -18,7 +18,6 @@ #include <asm/uaccess.h> #include <asm/io.h> -#include <asm/rtc.h> #include <asm/segment.h> #include <asm/setup.h> #include <asm/macintosh.h> diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c index e6a3b56c6481..c11d38dfad08 100644 --- a/arch/m68k/mvme147/config.c +++ b/arch/m68k/mvme147/config.c @@ -32,7 +32,6 @@ #include <asm/setup.h> #include <asm/irq.h> #include <asm/traps.h> -#include <asm/rtc.h> #include 
<asm/machdep.h> #include <asm/mvme147hw.h> diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c index a53803cc66cd..58e240939d26 100644 --- a/arch/m68k/mvme16x/config.c +++ b/arch/m68k/mvme16x/config.c @@ -35,7 +35,6 @@ #include <asm/setup.h> #include <asm/irq.h> #include <asm/traps.h> -#include <asm/rtc.h> #include <asm/machdep.h> #include <asm/mvme16xhw.h> diff --git a/arch/m68k/q40/config.c b/arch/m68k/q40/config.c index e90fe903613e..fcb7f05b60b6 100644 --- a/arch/m68k/q40/config.c +++ b/arch/m68k/q40/config.c @@ -12,6 +12,7 @@ * for more details. */ +#include <linux/errno.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -27,7 +28,6 @@ #include <linux/platform_device.h> #include <asm/io.h> -#include <asm/rtc.h> #include <asm/bootinfo.h> #include <asm/pgtable.h> #include <asm/setup.h> diff --git a/arch/m68k/sun3/config.c b/arch/m68k/sun3/config.c index 71884bf01d72..3af34fa3a344 100644 --- a/arch/m68k/sun3/config.c +++ b/arch/m68k/sun3/config.c @@ -26,7 +26,6 @@ #include <asm/pgalloc.h> #include <asm/sun3-head.h> #include <asm/sun3mmu.h> -#include <asm/rtc.h> #include <asm/machdep.h> #include <asm/machines.h> #include <asm/idprom.h> diff --git a/arch/m68k/sun3/intersil.c b/arch/m68k/sun3/intersil.c index 889829e11f1d..2cd0bcbe6f30 100644 --- a/arch/m68k/sun3/intersil.c +++ b/arch/m68k/sun3/intersil.c @@ -14,8 +14,8 @@ #include <linux/rtc.h> #include <asm/errno.h> -#include <asm/rtc.h> #include <asm/intersil.h> +#include <asm/machdep.h> /* bits to set for start/run of the intersil */ diff --git a/arch/m68k/sun3x/time.c b/arch/m68k/sun3x/time.c index c8eb08add6b0..431d3c4306dd 100644 --- a/arch/m68k/sun3x/time.c +++ b/arch/m68k/sun3x/time.c @@ -15,10 +15,10 @@ #include <asm/irq.h> #include <asm/io.h> +#include <asm/machdep.h> #include <asm/traps.h> #include <asm/sun3x.h> #include <asm/sun3ints.h> -#include <asm/rtc.h> #include "time.h" diff --git a/arch/metag/include/asm/cmpxchg_lnkget.h 
b/arch/metag/include/asm/cmpxchg_lnkget.h index 0154e2807ebb..2369ad394876 100644 --- a/arch/metag/include/asm/cmpxchg_lnkget.h +++ b/arch/metag/include/asm/cmpxchg_lnkget.h @@ -73,7 +73,7 @@ static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old, " DCACHE [%2], %0\n" #endif "2:\n" - : "=&d" (temp), "=&da" (retval) + : "=&d" (temp), "=&d" (retval) : "da" (m), "bd" (old), "da" (new) : "cc" ); diff --git a/arch/metag/include/asm/metag_mem.h b/arch/metag/include/asm/metag_mem.h index aa5a076df439..7848bc6d3b61 100644 --- a/arch/metag/include/asm/metag_mem.h +++ b/arch/metag/include/asm/metag_mem.h @@ -881,7 +881,7 @@ #define PERFCTRL_DCSTALL 11 /* Dcache+TLB o/p delayed (per-thread) */ #define PERFCTRL_ICSTALL 12 /* Icache+TLB o/p delayed (per-thread) */ -#define PERFCTRL_INT 13 /* Internal core delailed events (see next) */ +#define PERFCTRL_INT 13 /* Internal core detailed events (see next) */ #define PERFCTRL_EXT 15 /* External source in core periphery */ #endif /* METAC_2_1 */ diff --git a/arch/metag/include/asm/metag_regs.h b/arch/metag/include/asm/metag_regs.h index 40c3f679c5b8..60b750971d8a 100644 --- a/arch/metag/include/asm/metag_regs.h +++ b/arch/metag/include/asm/metag_regs.h @@ -179,7 +179,7 @@ ; is best to dump these registers immediately at the start of a routine ; using a MSETL or SETL instruction- ; -; MSETL [A0StP],D0Ar6,D0Ar4,D0Ar2; Only dump argments expected +; MSETL [A0StP],D0Ar6,D0Ar4,D0Ar2; Only dump arguments expected ;or SETL [A0StP+#8++],D0Ar2 ; Up to two 32-bit args expected ; ; For non-leaf routines it is always necessary to save and restore at least diff --git a/arch/metag/kernel/cachepart.c b/arch/metag/kernel/cachepart.c index 04b7d4f8429a..db944c2e7d88 100644 --- a/arch/metag/kernel/cachepart.c +++ b/arch/metag/kernel/cachepart.c @@ -15,7 +15,7 @@ #define SYSC_DCPART(n) (SYSC_DCPART0 + SYSC_xCPARTn_STRIDE * (n)) #define SYSC_ICPART(n) (SYSC_ICPART0 + SYSC_xCPARTn_STRIDE * (n)) -#define CACHE_ASSOCIATIVITY 4 /* 4 
way set-assosiative */ +#define CACHE_ASSOCIATIVITY 4 /* 4 way set-associative */ #define ICACHE 0 #define DCACHE 1 diff --git a/arch/metag/lib/divsi3.S b/arch/metag/lib/divsi3.S index 7c8a8ae9a0a1..11124cc93dee 100644 --- a/arch/metag/lib/divsi3.S +++ b/arch/metag/lib/divsi3.S @@ -50,7 +50,7 @@ $LIDMCQuick: ADDCC D0Re0,D0Re0,#1 ! If yes result += 1 SUBCC D1Ar1,D1Ar1,D1Re0 ! and A -= Bu ORS D0Ar4,D0Ar4,D0Ar4 ! Return neg result? - NEG D0Ar2,D0Re0 ! Calulate neg result + NEG D0Ar2,D0Re0 ! Calculate neg result MOVMI D0Re0,D0Ar2 ! Yes: Take neg result $LIDMCRet: MOV PC,D1RtP @@ -94,7 +94,7 @@ $LIDMCLoop: LSR D1Re0, D1Re0, #1 ! Shift down B BNZ $LIDMCLoop ! Was single bit in curbit lost? ORS D0Ar4,D0Ar4,D0Ar4 ! Return neg result? - NEG D0Ar2,D0Re0 ! Calulate neg result + NEG D0Ar2,D0Re0 ! Calculate neg result MOVMI D0Re0,D0Ar2 ! Yes: Take neg result MOV PC,D1RtP .size ___divsi3,.-___divsi3 diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index 372783a67dda..c765b3621b9b 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c @@ -187,7 +187,7 @@ bad_area_nosemaphore: if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { - pr_info("%s%s[%d]: segfault at %lx pc %08x sp %08x write %d trap %#x (%s)", + printk("%s%s[%d]: segfault at %lx pc %08x sp %08x write %d trap %#x (%s)", task_pid_nr(tsk) > 1 ? 
KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, regs->ctx.CurrPC, regs->ctx.AX[0].U0, diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c index 063c2dd31e72..2f45b0357021 100644 --- a/arch/mips/sgi-ip22/ip22-reset.c +++ b/arch/mips/sgi-ip22/ip22-reset.c @@ -7,7 +7,7 @@ */ #include <linux/linkage.h> #include <linux/init.h> -#include <linux/ds1286.h> +#include <linux/rtc/ds1286.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/kernel.h> diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c index fb4b3520cdc6..7ee14f41fc25 100644 --- a/arch/mips/sni/time.c +++ b/arch/mips/sni/time.c @@ -8,7 +8,6 @@ #include <asm/sni.h> #include <asm/time.h> -#include <asm-generic/rtc.h> #define SNI_CLOCK_TICK_RATE 3686400 #define SNI_COUNTER2_DIV 64 diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 9627e81a6cbb..38e3494bfb63 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -236,7 +236,9 @@ source "kernel/Kconfig.hz" config MN10300_RTC bool "Using MN10300 RTC" depends on MN10300_PROC_MN103E010 || MN10300_PROC_MN2WS0050 - select GENERIC_CMOS_UPDATE + select RTC_CLASS + select RTC_DRV_CMOS + select RTC_SYSTOHC default n help This option enables support for the RTC, thus enabling time to be diff --git a/arch/mn10300/include/asm/rtc-regs.h b/arch/mn10300/include/asm/rtc-regs.h index c42deefaec11..c81cacecb6e3 100644 --- a/arch/mn10300/include/asm/rtc-regs.h +++ b/arch/mn10300/include/asm/rtc-regs.h @@ -75,9 +75,9 @@ #define RTC_PORT(x) 0xd8600000 #define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */ -#define CMOS_READ(addr) __SYSREG(0xd8600000 + (addr), u8) +#define CMOS_READ(addr) __SYSREG(0xd8600000 + (u32)(addr), u8) #define CMOS_WRITE(val, addr) \ - do { __SYSREG(0xd8600000 + (addr), u8) = val; } while (0) + do { __SYSREG(0xd8600000 + (u32)(addr), u8) = val; } while (0) #define RTC_IRQ RTIRQ diff --git a/arch/mn10300/include/asm/rtc.h b/arch/mn10300/include/asm/rtc.h index 
6c14bb1d0d9b..07dc87656197 100644 --- a/arch/mn10300/include/asm/rtc.h +++ b/arch/mn10300/include/asm/rtc.h @@ -25,6 +25,4 @@ static inline void calibrate_clock(void) #endif /* !CONFIG_MN10300_RTC */ -#include <asm-generic/rtc.h> - #endif /* _ASM_RTC_H */ diff --git a/arch/mn10300/kernel/rtc.c b/arch/mn10300/kernel/rtc.c index 48d7058b3295..f81f37025072 100644 --- a/arch/mn10300/kernel/rtc.c +++ b/arch/mn10300/kernel/rtc.c @@ -12,107 +12,19 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/mc146818rtc.h> -#include <linux/bcd.h> -#include <linux/timex.h> +#include <linux/ioport.h> +#include <linux/platform_device.h> + #include <asm/rtc-regs.h> #include <asm/rtc.h> DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -/* - * Read the current RTC time - */ -void read_persistent_clock(struct timespec *ts) -{ - struct rtc_time tm; - - get_rtc_time(&tm); - - ts->tv_nsec = 0; - ts->tv_sec = mktime(tm.tm_year, tm.tm_mon, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec); - - /* if rtc is way off in the past, set something reasonable */ - if (ts->tv_sec < 0) - ts->tv_sec = mktime(2009, 1, 1, 12, 0, 0); -} - -/* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500 - * ms after the second nowtime has started, because when nowtime is written - * into the registers of the CMOS clock, it will jump to the next second - * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data - * sheet for details. - * - * BUG: This routine does not handle hour overflow properly; it just - * sets the minutes. Usually you'll only notice that after reboot! 
- */ -static int set_rtc_mmss(unsigned long nowtime) -{ - unsigned char save_control, save_freq_select; - int retval = 0; - int real_seconds, real_minutes, cmos_minutes; - - /* gets recalled with irq locally disabled */ - spin_lock(&rtc_lock); - save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being - * set */ - CMOS_WRITE(save_control | RTC_SET, RTC_CONTROL); - - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset - * prescaler */ - CMOS_WRITE(save_freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT); - - cmos_minutes = CMOS_READ(RTC_MINUTES); - if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - cmos_minutes = bcd2bin(cmos_minutes); - - /* - * since we're only adjusting minutes and seconds, - * don't interfere with hour overflow. This avoids - * messing with unknown time zones but requires your - * RTC not to be off by more than 15 minutes - */ - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) - /* correct for half hour time zone */ - real_minutes += 30; - real_minutes %= 60; - - if (abs(real_minutes - cmos_minutes) < 30) { - if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - real_seconds = bin2bcd(real_seconds); - real_minutes = bin2bcd(real_minutes); - } - CMOS_WRITE(real_seconds, RTC_SECONDS); - CMOS_WRITE(real_minutes, RTC_MINUTES); - } else { - printk_once(KERN_NOTICE - "set_rtc_mmss: can't update from %d to %d\n", - cmos_minutes, real_minutes); - retval = -1; - } - - /* The following flags have to be released exactly in this order, - * otherwise the DS12887 (popular MC146818A clone with integrated - * battery and quartz) will not reset the oscillator and will not - * update precisely 500 ms later. You won't find this mentioned in - * the Dallas Semiconductor data sheets, but who believes data - * sheets anyway ... 
-- Markus Kuhn - */ - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - spin_unlock(&rtc_lock); - - return retval; -} - -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); -} +static const __initdata struct resource res[] = { + DEFINE_RES_IO(RTC_PORT(0), RTC_IO_EXTENT), + DEFINE_RES_IRQ(RTC_IRQ), +}; /* * calibrate the TSC clock against the RTC @@ -129,4 +41,6 @@ void __init calibrate_clock(void) RTCRA |= RTCRA_DVR; RTCRA &= ~RTCRA_DVR; RTCRB &= ~RTCRB_SET; + + platform_device_register_simple("rtc_cmos", -1, res, ARRAY_SIZE(res)); } diff --git a/arch/mn10300/proc-mn103e010/proc-init.c b/arch/mn10300/proc-mn103e010/proc-init.c index 27b97980dca4..102d86a6ae56 100644 --- a/arch/mn10300/proc-mn103e010/proc-init.c +++ b/arch/mn10300/proc-mn103e010/proc-init.c @@ -9,7 +9,10 @@ * 2 of the Licence, or (at your option) any later version. */ #include <linux/kernel.h> +#include <linux/irq.h> +#include <asm/cacheflush.h> #include <asm/fpu.h> +#include <asm/irq.h> #include <asm/rtc.h> #include <asm/busctl-regs.h> diff --git a/arch/mn10300/proc-mn2ws0050/proc-init.c b/arch/mn10300/proc-mn2ws0050/proc-init.c index ee6d03dbc8d8..950cc8dbb284 100644 --- a/arch/mn10300/proc-mn2ws0050/proc-init.c +++ b/arch/mn10300/proc-mn2ws0050/proc-init.c @@ -14,6 +14,7 @@ #include <linux/delay.h> #include <linux/interrupt.h> +#include <asm/cacheflush.h> #include <asm/processor.h> #include <asm/uaccess.h> #include <asm/io.h> diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index dc117385ce2e..cd8778103165 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -31,6 +31,7 @@ config PARISC select TTY # Needed for pdc_cons.c select HAVE_DEBUG_STACKOVERFLOW select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HASH select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_UNSTABLE_SCHED_CLOCK if (SMP || !64BIT) diff --git a/arch/parisc/include/asm/hash.h b/arch/parisc/include/asm/hash.h new file mode 
100644 index 000000000000..dbe93311aa26 --- /dev/null +++ b/arch/parisc/include/asm/hash.h @@ -0,0 +1,146 @@ +#ifndef _ASM_HASH_H +#define _ASM_HASH_H + +/* + * HP-PA only implements integer multiply in the FPU. However, for + * integer multiplies by constant, it has a number of shift-and-add + * (but no shift-and-subtract, sigh!) instructions that a compiler + * can synthesize a code sequence with. + * + * Unfortunately, GCC isn't very efficient at using them. For example + * it uses three instructions for "x *= 21" when only two are needed. + * But we can find a sequence manually. + */ + +#define HAVE_ARCH__HASH_32 1 + +/* + * This is a multiply by GOLDEN_RATIO_32 = 0x61C88647 optimized for the + * PA7100 pairing rules. This is an in-order 2-way superscalar processor. + * Only one instruction in a pair may be a shift (by more than 3 bits), + * but other than that, simple ALU ops (including shift-and-add by up + * to 3 bits) may be paired arbitrarily. + * + * PA8xxx processors also dual-issue ALU instructions, although with + * fewer constraints, so this schedule is good for them, too. + * + * This 6-step sequence was found by Yevgen Voronenko's implementation + * of the Hcub algorithm at http://spiral.ece.cmu.edu/mcm/gen.html. + */ +static inline u32 __attribute_const__ __hash_32(u32 x) +{ + u32 a, b, c; + + /* + * Phase 1: Compute a = (x << 19) + x, + * b = (x << 9) + a, c = (x << 23) + b. + */ + a = x << 19; /* Two shifts can't be paired */ + b = x << 9; a += x; + c = x << 23; b += a; + c += b; + /* Phase 2: Return (b<<11) + (c<<6) + (a<<3) - c */ + b <<= 11; + a += c << 3; b -= c; + return (a << 3) + b; +} + +#if BITS_PER_LONG == 64 + +#define HAVE_ARCH_HASH_64 1 + +/* + * Finding a good shift-and-add chain for GOLDEN_RATIO_64 is tricky, + * because available software for the purpose chokes on constants this + * large. (It's mostly designed for compiling FIR filter coefficients + * into FPGAs.) + * + * However, Jason Thong pointed out a work-around. 
The Hcub software + * (http://spiral.ece.cmu.edu/mcm/gen.html) is designed for *multiple* + * constant multiplication, and is good at finding shift-and-add chains + * which share common terms. + * + * Looking at 0x61C8864680B583EB in binary: + * 0110000111001000100001100100011010000000101101011000001111101011 + * \______________/ \__________/ \_______/ \________/ + * \____________________________/ \____________________/ + * you can see the non-zero bits are divided into several well-separated + * blocks. Hcub can find algorithms for those terms separately, which + * can then be shifted and added together. + * + * Dividing the input into 2, 3 or 4 blocks, Hcub can find solutions + * with 10, 9 or 8 adds, respectively, making a total of 11 for the + * whole number. + * + * Using just two large blocks, 0xC3910C8D << 31 in the high bits, + * and 0xB583EB in the low bits, produces as good an algorithm as any, + * and with one more small shift than alternatives. + * + * The high bits are a larger number and more work to compute, as well + * as needing one extra cycle to shift left 31 bits before the final + * addition, so they are the critical path for scheduling. The low bits + * can fit into the scheduling slots left over. + */ + + +/* + * This _ASSIGN(dst, src) macro performs "dst = src", but prevents GCC + * from inferring anything about the value assigned to "dst". + * + * This prevents it from mis-optimizing certain sequences. + * In particular, gcc is annoyingly eager to combine consecutive shifts. + * Given "x <<= 19; y += x; z += x << 1;", GCC will turn this into + * "y += x << 19; z += x << 20;" even though the latter sequence needs + * an additional instruction and temporary register. + * + * Because no actual assembly code is generated, this construct is + * usefully portable across all GCC platforms, and so can be test-compiled + * on non-PA systems. + * + * In two places, additional unused input dependencies are added.
This + * forces GCC's scheduling so it does not rearrange instructions too much. + * Because the PA-8xxx is out of order, I'm not sure how much this matters, + * but why make it more difficult for the processor than necessary? + */ +#define _ASSIGN(dst, src, ...) asm("" : "=r" (dst) : "0" (src), ##__VA_ARGS__) + +/* + * Multiply by GOLDEN_RATIO_64 = 0x61C8864680B583EB using a heavily + * optimized shift-and-add sequence. + * + * Without the final shift, the multiply proper is 19 instructions, + * 10 cycles and uses only 4 temporaries. Whew! + * + * You are not expected to understand this. + */ +static __always_inline u32 __attribute_const__ +hash_64(u64 a, unsigned int bits) +{ + u64 b, c, d; + + /* + * Encourage GCC to move a dynamic shift to %sar early, + * thereby freeing up an additional temporary register. + */ + if (!__builtin_constant_p(bits)) + asm("" : "=q" (bits) : "0" (64 - bits)); + else + bits = 64 - bits; + + _ASSIGN(b, a*5); c = a << 13; + b = (b << 2) + a; _ASSIGN(d, a << 17); + a = b + (a << 1); c += d; + d = a << 10; _ASSIGN(a, a << 19); + d = a - d; _ASSIGN(a, a << 4, "X" (d)); + c += b; a += b; + d -= c; c += a << 1; + a += c << 3; _ASSIGN(b, b << (7+31), "X" (c), "X" (d)); + a <<= 31; b += d; + a += b; + return a >> bits; +} +#undef _ASSIGN /* We're a widely-used header file, so don't litter! */ + +#endif /* BITS_PER_LONG == 64 */ + +#endif /* _ASM_HASH_H */ diff --git a/arch/parisc/include/asm/mc146818rtc.h b/arch/parisc/include/asm/mc146818rtc.h deleted file mode 100644 index adf41631449f..000000000000 --- a/arch/parisc/include/asm/mc146818rtc.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Machine dependent access functions for RTC registers.
- */ -#ifndef _ASM_MC146818RTC_H -#define _ASM_MC146818RTC_H - -/* empty include file to satisfy the include in genrtc.c */ - -#endif /* _ASM_MC146818RTC_H */ diff --git a/arch/parisc/include/asm/rtc.h b/arch/parisc/include/asm/rtc.h deleted file mode 100644 index 099d641a42c2..000000000000 --- a/arch/parisc/include/asm/rtc.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * include/asm-parisc/rtc.h - * - * Copyright 2002 Randolph CHung <tausq@debian.org> - * - * Based on: include/asm-ppc/rtc.h and the genrtc driver in the - * 2.4 parisc linux tree - */ - -#ifndef __ASM_RTC_H__ -#define __ASM_RTC_H__ - -#ifdef __KERNEL__ - -#include <linux/rtc.h> - -#include <asm/pdc.h> - -#define SECS_PER_HOUR (60 * 60) -#define SECS_PER_DAY (SECS_PER_HOUR * 24) - - -#define RTC_PIE 0x40 /* periodic interrupt enable */ -#define RTC_AIE 0x20 /* alarm interrupt enable */ -#define RTC_UIE 0x10 /* update-finished interrupt enable */ - -#define RTC_BATT_BAD 0x100 /* battery bad */ - -/* some dummy definitions */ -#define RTC_SQWE 0x08 /* enable square-wave output */ -#define RTC_DM_BINARY 0x04 /* all time/date values are BCD if clear */ -#define RTC_24H 0x02 /* 24 hour mode - else hours bit 7 means pm */ -#define RTC_DST_EN 0x01 /* auto switch DST - works f. USA only */ - -# define __isleap(year) \ - ((year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0)) - -/* How many days come before each month (0-12). */ -static const unsigned short int __mon_yday[2][13] = -{ - /* Normal years. */ - { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 }, - /* Leap years. 
*/ - { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 } -}; - -static inline unsigned int get_rtc_time(struct rtc_time *wtime) -{ - struct pdc_tod tod_data; - long int days, rem, y; - const unsigned short int *ip; - - memset(wtime, 0, sizeof(*wtime)); - if (pdc_tod_read(&tod_data) < 0) - return RTC_24H | RTC_BATT_BAD; - - // most of the remainder of this function is: -// Copyright (C) 1991, 1993, 1997, 1998 Free Software Foundation, Inc. -// This was originally a part of the GNU C Library. -// It is distributed under the GPL, and was swiped from offtime.c - - - days = tod_data.tod_sec / SECS_PER_DAY; - rem = tod_data.tod_sec % SECS_PER_DAY; - - wtime->tm_hour = rem / SECS_PER_HOUR; - rem %= SECS_PER_HOUR; - wtime->tm_min = rem / 60; - wtime->tm_sec = rem % 60; - - y = 1970; - -#define DIV(a, b) ((a) / (b) - ((a) % (b) < 0)) -#define LEAPS_THRU_END_OF(y) (DIV (y, 4) - DIV (y, 100) + DIV (y, 400)) - - while (days < 0 || days >= (__isleap (y) ? 366 : 365)) - { - /* Guess a corrected year, assuming 365 days per year. */ - long int yg = y + days / 365 - (days % 365 < 0); - - /* Adjust DAYS and Y to match the guessed year. 
*/ - days -= ((yg - y) * 365 - + LEAPS_THRU_END_OF (yg - 1) - - LEAPS_THRU_END_OF (y - 1)); - y = yg; - } - wtime->tm_year = y - 1900; - - ip = __mon_yday[__isleap(y)]; - for (y = 11; days < (long int) ip[y]; --y) - continue; - days -= ip[y]; - wtime->tm_mon = y; - wtime->tm_mday = days + 1; - - return RTC_24H; -} - -static int set_rtc_time(struct rtc_time *wtime) -{ - u_int32_t secs; - - secs = mktime(wtime->tm_year + 1900, wtime->tm_mon + 1, wtime->tm_mday, - wtime->tm_hour, wtime->tm_min, wtime->tm_sec); - - if(pdc_tod_set(secs, 0) < 0) - return -1; - else - return 0; - -} - -static inline unsigned int get_rtc_ss(void) -{ - struct rtc_time h; - - get_rtc_time(&h); - return h.tm_sec; -} - -static inline int get_rtc_pll(struct rtc_pll_info *pll) -{ - return -EINVAL; -} -static inline int set_rtc_pll(struct rtc_pll_info *pll) -{ - return -EINVAL; -} - -#endif /* __KERNEL__ */ -#endif /* __ASM_RTC_H__ */ diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index 22395901d47b..e5d71905cad5 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -1354,9 +1354,9 @@ int pdc_pat_io_pci_cfg_read(unsigned long pci_addr, int pci_size, u32 *mem_addr) retval = mem_pdc_call(PDC_PAT_IO, PDC_PAT_IO_PCI_CONFIG_READ, __pa(pdc_result), pci_addr, pci_size); switch(pci_size) { - case 1: *(u8 *) mem_addr = (u8) pdc_result[0]; - case 2: *(u16 *)mem_addr = (u16) pdc_result[0]; - case 4: *(u32 *)mem_addr = (u32) pdc_result[0]; + case 1: *(u8 *) mem_addr = (u8) pdc_result[0]; break; + case 2: *(u16 *)mem_addr = (u16) pdc_result[0]; break; + case 4: *(u32 *)mem_addr = (u32) pdc_result[0]; break; } spin_unlock_irqrestore(&pdc_lock, flags); diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 31ec99a5f119..505cf1ac5af2 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -12,6 +12,7 @@ */ #include <linux/errno.h> #include <linux/module.h> +#include <linux/rtc.h> #include <linux/sched.h> #include 
<linux/kernel.h> #include <linux/param.h> @@ -248,14 +249,47 @@ void __init start_cpu_itimer(void) per_cpu(cpu_data, cpu).it_value = next_tick; } +#if IS_ENABLED(CONFIG_RTC_DRV_GENERIC) +static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm) +{ + struct pdc_tod tod_data; + + memset(tm, 0, sizeof(*tm)); + if (pdc_tod_read(&tod_data) < 0) + return -EOPNOTSUPP; + + /* we treat tod_sec as unsigned, so this can work until year 2106 */ + rtc_time64_to_tm(tod_data.tod_sec, tm); + return rtc_valid_tm(tm); +} + +static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm) +{ + time64_t secs = rtc_tm_to_time64(tm); + + if (pdc_tod_set(secs, 0) < 0) + return -EOPNOTSUPP; + + return 0; +} + +static const struct rtc_class_ops rtc_generic_ops = { + .read_time = rtc_generic_get_time, + .set_time = rtc_generic_set_time, +}; + static int __init rtc_init(void) { struct platform_device *pdev; - pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0); + pdev = platform_device_register_data(NULL, "rtc-generic", -1, + &rtc_generic_ops, + sizeof(rtc_generic_ops)); + return PTR_ERR_OR_ZERO(pdev); } device_initcall(rtc_init); +#endif void read_persistent_clock(struct timespec *ts) { diff --git a/arch/parisc/lib/iomap.c b/arch/parisc/lib/iomap.c index fb8e10a4fb39..eaffbb90aa14 100644 --- a/arch/parisc/lib/iomap.c +++ b/arch/parisc/lib/iomap.c @@ -125,22 +125,22 @@ static void ioport_write32r(void __iomem *addr, const void *s, unsigned long n) } static const struct iomap_ops ioport_ops = { - ioport_read8, - ioport_read16, - ioport_read16, - ioport_read32, - ioport_read32, - ioport_write8, - ioport_write16, - ioport_write16, - ioport_write32, - ioport_write32, - ioport_read8r, - ioport_read16r, - ioport_read32r, - ioport_write8r, - ioport_write16r, - ioport_write32r, + .read8 = ioport_read8, + .read16 = ioport_read16, + .read16be = ioport_read16, + .read32 = ioport_read32, + .read32be = ioport_read32, + .write8 = ioport_write8, + .write16 = 
ioport_write16, + .write16be = ioport_write16, + .write32 = ioport_write32, + .write32be = ioport_write32, + .read8r = ioport_read8r, + .read16r = ioport_read16r, + .read32r = ioport_read32r, + .write8r = ioport_write8r, + .write16r = ioport_write16r, + .write32r = ioport_write32r, }; /* Legacy I/O memory ops */ @@ -244,22 +244,22 @@ static void iomem_write32r(void __iomem *addr, const void *s, unsigned long n) } static const struct iomap_ops iomem_ops = { - iomem_read8, - iomem_read16, - iomem_read16be, - iomem_read32, - iomem_read32be, - iomem_write8, - iomem_write16, - iomem_write16be, - iomem_write32, - iomem_write32be, - iomem_read8r, - iomem_read16r, - iomem_read32r, - iomem_write8r, - iomem_write16r, - iomem_write32r, + .read8 = iomem_read8, + .read16 = iomem_read16, + .read16be = iomem_read16be, + .read32 = iomem_read32, + .read32be = iomem_read32be, + .write8 = iomem_write8, + .write16 = iomem_write16, + .write16be = iomem_write16be, + .write32 = iomem_write32, + .write32be = iomem_write32be, + .read8r = iomem_read8r, + .read16r = iomem_read16r, + .read32r = iomem_read32r, + .write8r = iomem_write8r, + .write16r = iomem_write16r, + .write32r = iomem_write32r, }; static const struct iomap_ops *iomap_ops[8] = { diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 171047822b56..63292f64b25a 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -60,6 +60,25 @@ config CODE_PATCHING_SELFTEST depends on DEBUG_KERNEL default n +config JUMP_LABEL_FEATURE_CHECKS + bool "Enable use of jump label for cpu/mmu_has_feature()" + depends on JUMP_LABEL + default y + help + Selecting this option enables use of jump labels for some internal + feature checks. This should generate more optimal code for those + checks.
+ +config JUMP_LABEL_FEATURE_CHECK_DEBUG + bool "Do extra check on feature fixup calls" + depends on DEBUG_KERNEL && JUMP_LABEL_FEATURE_CHECKS + default n + help + This tries to catch incorrect usage of cpu_has_feature() and + mmu_has_feature() in the code. + + If you don't know what this means, say N. + config FTR_FIXUP_SELFTEST bool "Run self-tests of the feature-fixup code" depends on DEBUG_KERNEL diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h index 60f47649306f..c45189aa7476 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h @@ -11,4 +11,19 @@ extern unsigned long radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); + +static inline int hstate_get_psize(struct hstate *hstate) +{ + unsigned long shift; + + shift = huge_page_shift(hstate); + if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) + return MMU_PAGE_2M; + else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) + return MMU_PAGE_1G; + else { + WARN(1, "Wrong huge page shift\n"); + return mmu_virtual_psize; + } +} #endif diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 5eaf86ac143d..287a656ceb57 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -24,6 +24,7 @@ #include <asm/book3s/64/pgtable.h> #include <asm/bug.h> #include <asm/processor.h> +#include <asm/cpu_has_feature.h> /* * SLB @@ -190,6 +191,15 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) BUG(); } +static inline unsigned long get_sllp_encoding(int psize) +{ + unsigned long sllp; + + sllp = ((mmu_psize_defs[psize].sllp & SLB_VSID_L) >> 6) | + ((mmu_psize_defs[psize].sllp & SLB_VSID_LP) >> 4); + return sllp; +} + #endif /* __ASSEMBLY__ */ /* diff --git 
a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index d4eda6420523..8afb0e00f7d9 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -23,13 +23,6 @@ struct mmu_psize_def { }; extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; -#ifdef CONFIG_PPC_RADIX_MMU -#define radix_enabled() mmu_has_feature(MMU_FTR_RADIX) -#else -#define radix_enabled() (0) -#endif - - #endif /* __ASSEMBLY__ */ /* 64-bit classic hash table MMU */ @@ -107,6 +100,9 @@ extern int mmu_vmemmap_psize; extern int mmu_io_psize; /* MMU initialization */ +void mmu_early_init_devtree(void); +void hash__early_init_devtree(void); +void radix__early_init_devtree(void); extern void radix_init_native(void); extern void hash__early_init_mmu(void); extern void radix__early_init_mmu(void); @@ -132,11 +128,15 @@ extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { - if (radix_enabled()) + if (early_radix_enabled()) return radix__setup_initial_memory_limit(first_memblock_base, first_memblock_size); return hash__setup_initial_memory_limit(first_memblock_base, first_memblock_size); } + +extern int (*register_process_table)(unsigned long base, unsigned long page_size, + unsigned long tbl_size); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index f12ddf5e8de5..2f6373144e2c 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -75,11 +75,6 @@ static inline void hash__flush_tlb_page(struct vm_area_struct *vma, { } -static inline void hash__flush_tlb_page_nohash(struct vm_area_struct *vma, - unsigned long vmaddr) -{ -} - static inline void hash__flush_tlb_range(struct 
vm_area_struct *vma, unsigned long start, unsigned long end) { diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 00703e7e4c94..65037762b120 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -10,26 +10,32 @@ static inline int mmu_get_ap(int psize) return mmu_psize_defs[psize].ap; } +extern void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +extern void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize); +extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); extern void radix__local_flush_tlb_mm(struct mm_struct *mm); extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); -extern void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, - unsigned long ap, int nid); extern void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); +extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize); extern void radix__tlb_flush(struct mmu_gather *tlb); #ifdef CONFIG_SMP extern void radix__flush_tlb_mm(struct mm_struct *mm); extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); -extern void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, - unsigned long ap, int nid); extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); +extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize); #else #define radix__flush_tlb_mm(mm) radix__local_flush_tlb_mm(mm) #define 
radix__flush_tlb_page(vma,addr) radix__local_flush_tlb_page(vma,addr) -#define radix___flush_tlb_page(mm,addr,p,i) radix___local_flush_tlb_page(mm,addr,p,i) +#define radix__flush_tlb_page_psize(mm,addr,p) radix__local_flush_tlb_page_psize(mm,addr,p) #define radix__flush_tlb_pwc(tlb, addr) radix__local_flush_tlb_pwc(tlb, addr) #endif extern void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa, diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 96e5769b18b0..72b925f97bab 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -7,6 +7,25 @@ #include <asm/book3s/64/tlbflush-hash.h> #include <asm/book3s/64/tlbflush-radix.h> +#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE +static inline void flush_pmd_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (radix_enabled()) + return radix__flush_pmd_tlb_range(vma, start, end); + return hash__flush_tlb_range(vma, start, end); +} + +#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE +static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + if (radix_enabled()) + return radix__flush_hugetlb_tlb_range(vma, start, end); + return hash__flush_tlb_range(vma, start, end); +} + static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { @@ -38,14 +57,6 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma, return hash__local_flush_tlb_page(vma, vmaddr); } -static inline void flush_tlb_page_nohash(struct vm_area_struct *vma, - unsigned long vmaddr) -{ - if (radix_enabled()) - return radix__flush_tlb_page(vma, vmaddr); - return hash__flush_tlb_page_nohash(vma, vmaddr); -} - static inline void tlb_flush(struct mmu_gather *tlb) { if (radix_enabled()) diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index 
69fb16d7a811..b77f0364df94 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -11,6 +11,7 @@ #include <linux/mm.h> #include <asm/cputable.h> +#include <asm/cpu_has_feature.h> /* * No cache flushing is required when address mappings are changed, diff --git a/arch/powerpc/include/asm/cpu_has_feature.h b/arch/powerpc/include/asm/cpu_has_feature.h new file mode 100644 index 000000000000..2ef55f8968a2 --- /dev/null +++ b/arch/powerpc/include/asm/cpu_has_feature.h @@ -0,0 +1,53 @@ +#ifndef __ASM_POWERPC_CPUFEATURES_H +#define __ASM_POWERPC_CPUFEATURES_H + +#ifndef __ASSEMBLY__ + +#include <linux/bug.h> +#include <asm/cputable.h> + +static inline bool early_cpu_has_feature(unsigned long feature) +{ + return !!((CPU_FTRS_ALWAYS & feature) || + (CPU_FTRS_POSSIBLE & cur_cpu_spec->cpu_features & feature)); +} + +#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS +#include <linux/jump_label.h> + +#define NUM_CPU_FTR_KEYS 64 + +extern struct static_key_true cpu_feature_keys[NUM_CPU_FTR_KEYS]; + +static __always_inline bool cpu_has_feature(unsigned long feature) +{ + int i; + + BUILD_BUG_ON(!__builtin_constant_p(feature)); + +#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG + if (!static_key_initialized) { + printk("Warning! 
cpu_has_feature() used prior to jump label init!\n"); + dump_stack(); + return early_cpu_has_feature(feature); + } +#endif + + if (CPU_FTRS_ALWAYS & feature) + return true; + + if (!(CPU_FTRS_POSSIBLE & feature)) + return false; + + i = __builtin_ctzl(feature); + return static_branch_likely(&cpu_feature_keys[i]); +} +#else +static inline bool cpu_has_feature(unsigned long feature) +{ + return early_cpu_has_feature(feature); +} +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_POWERPC_CPUFEATURES_H */ diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index df4fb5faba43..82026b419341 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -2,6 +2,7 @@ #define __ASM_POWERPC_CPUTABLE_H +#include <linux/types.h> #include <asm/asm-compat.h> #include <asm/feature-fixups.h> #include <uapi/asm/cputable.h> @@ -122,6 +123,12 @@ extern void do_feature_fixups(unsigned long value, void *fixup_start, extern const char *powerpc_base_platform; +#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS +extern void cpu_feature_keys_init(void); +#else +static inline void cpu_feature_keys_init(void) { } +#endif + /* TLB flush actions.
Used as argument to cpu_spec.flush_tlb() hook */ enum { TLB_INVAL_SCOPE_GLOBAL = 0, /* invalidate all TLBs */ @@ -576,14 +583,6 @@ enum { }; #endif /* __powerpc64__ */ -static inline int cpu_has_feature(unsigned long feature) -{ - return (CPU_FTRS_ALWAYS & feature) || - (CPU_FTRS_POSSIBLE - & cur_cpu_spec->cpu_features - & feature); -} - #define HBP_NUM 1 #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 2dfd4fc41f3e..4f60db074725 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -28,6 +28,7 @@ static inline void setup_cputime_one_jiffy(void) { } #include <asm/div64.h> #include <asm/time.h> #include <asm/param.h> +#include <asm/cpu_has_feature.h> typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h index 5fa6b20eba10..378167377065 100644 --- a/arch/powerpc/include/asm/dbell.h +++ b/arch/powerpc/include/asm/dbell.h @@ -16,6 +16,7 @@ #include <linux/threads.h> #include <asm/ppc-opcode.h> +#include <asm/cpu_has_feature.h> #define PPC_DBELL_MSG_BRDCAST (0x04000000) #define PPC_DBELL_TYPE(x) (((x) & 0xf) << (63-36)) diff --git a/arch/powerpc/include/asm/dcr-native.h b/arch/powerpc/include/asm/dcr-native.h index 4efc11dacb98..4a2beef74277 100644 --- a/arch/powerpc/include/asm/dcr-native.h +++ b/arch/powerpc/include/asm/dcr-native.h @@ -24,6 +24,7 @@ #include <linux/spinlock.h> #include <asm/cputable.h> +#include <asm/cpu_has_feature.h> typedef struct { unsigned int base; diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index e2d9f4996e5c..c5517f463ec7 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -147,7 +147,7 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { pte_t pte; pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); - flush_tlb_page(vma, addr); + 
flush_hugetlb_page(vma, addr); } static inline int huge_pte_none(pte_t pte) diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 9af103a23975..9a287e0ac8b1 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -22,7 +22,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:\n\t" - "nop\n\t" + "nop # arch_static_branch\n\t" ".pushsection __jump_table, \"aw\"\n\t" JUMP_ENTRY_TYPE "1b, %l[l_yes], %c0\n\t" ".popsection \n\t" @@ -36,7 +36,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { asm_volatile_goto("1:\n\t" - "b %l[l_yes]\n\t" + "b %l[l_yes] # arch_static_branch_jump\n\t" ".pushsection __jump_table, \"aw\"\n\t" JUMP_ENTRY_TYPE "1b, %l[l_yes], %c0\n\t" ".popsection \n\t" diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 1f4497fb5b83..88d17b4ea9c8 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -181,8 +181,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, switch (b_psize) { case MMU_PAGE_4K: - sllp = ((mmu_psize_defs[a_psize].sllp & SLB_VSID_L) >> 6) | - ((mmu_psize_defs[a_psize].sllp & SLB_VSID_LP) >> 4); + sllp = get_sllp_encoding(a_psize); rb |= sllp << 5; /* AP field */ rb |= (va_low & 0x7ff) << 12; /* remaining 11 bits of AVA */ break; diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 76f5398e7152..0420b388dd83 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -219,8 +219,6 @@ struct machdep_calls { #ifdef CONFIG_ARCH_RANDOM int (*get_random_seed)(unsigned long *v); #endif - int (*register_process_table)(unsigned long base, unsigned long page_size, - unsigned long tbl_size); }; extern void e500_idle(void); diff --git 
a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h index fc420cedecae..30922f699341 100644 --- a/arch/powerpc/include/asm/mman.h +++ b/arch/powerpc/include/asm/mman.h @@ -13,6 +13,7 @@ #include <asm/cputable.h> #include <linux/mm.h> +#include <asm/cpu_has_feature.h> /* * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits() diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 54471228f7b8..e2fb408f8398 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -12,7 +12,7 @@ */ /* - * First half is MMU families + * MMU families */ #define MMU_FTR_HPTE_TABLE ASM_CONST(0x00000001) #define MMU_FTR_TYPE_8xx ASM_CONST(0x00000002) @@ -21,9 +21,13 @@ #define MMU_FTR_TYPE_FSL_E ASM_CONST(0x00000010) #define MMU_FTR_TYPE_47x ASM_CONST(0x00000020) +/* Radix page table supported and enabled */ +#define MMU_FTR_TYPE_RADIX ASM_CONST(0x00000040) + /* - * This is individual features + * Individual features below. 
*/ + /* * We need to clear top 16bits of va (from the remaining 64 bits )in * tlbie* instructions @@ -93,11 +97,6 @@ */ #define MMU_FTR_1T_SEGMENT ASM_CONST(0x40000000) -/* - * Radix page table available - */ -#define MMU_FTR_RADIX ASM_CONST(0x80000000) - /* MMU feature bit sets for various CPUs */ #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2 \ MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2 @@ -113,6 +112,7 @@ #define MMU_FTRS_PA6T MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \ MMU_FTR_CI_LARGE_PAGE | MMU_FTR_NO_SLBIE_B #ifndef __ASSEMBLY__ +#include <linux/bug.h> #include <asm/cputable.h> #ifdef CONFIG_PPC_FSL_BOOK3E @@ -131,20 +131,71 @@ enum { MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA | #ifdef CONFIG_PPC_RADIX_MMU - MMU_FTR_RADIX | + MMU_FTR_TYPE_RADIX | #endif 0, }; -static inline int mmu_has_feature(unsigned long feature) +static inline bool early_mmu_has_feature(unsigned long feature) { - return (MMU_FTRS_POSSIBLE & cur_cpu_spec->mmu_features & feature); + return !!(MMU_FTRS_POSSIBLE & cur_cpu_spec->mmu_features & feature); +} + +#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS +#include <linux/jump_label.h> + +#define NUM_MMU_FTR_KEYS 32 + +extern struct static_key_true mmu_feature_keys[NUM_MMU_FTR_KEYS]; + +extern void mmu_feature_keys_init(void); + +static __always_inline bool mmu_has_feature(unsigned long feature) +{ + int i; + + BUILD_BUG_ON(!__builtin_constant_p(feature)); + +#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG + if (!static_key_initialized) { + printk("Warning! 
mmu_has_feature() used prior to jump label init!\n"); + dump_stack(); + return early_mmu_has_feature(feature); + } +#endif + + if (!(MMU_FTRS_POSSIBLE & feature)) + return false; + + i = __builtin_ctzl(feature); + return static_branch_likely(&mmu_feature_keys[i]); } static inline void mmu_clear_feature(unsigned long feature) { + int i; + + i = __builtin_ctzl(feature); cur_cpu_spec->mmu_features &= ~feature; + static_branch_disable(&mmu_feature_keys[i]); } +#else + +static inline void mmu_feature_keys_init(void) +{ + +} + +static inline bool mmu_has_feature(unsigned long feature) +{ + return early_mmu_has_feature(feature); +} + +static inline void mmu_clear_feature(unsigned long feature) +{ + cur_cpu_spec->mmu_features &= ~feature; +} +#endif /* CONFIG_JUMP_LABEL_FEATURE_CHECKS */ extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup; @@ -164,6 +215,28 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) } #endif /* !CONFIG_DEBUG_VM */ +#ifdef CONFIG_PPC_RADIX_MMU +static inline bool radix_enabled(void) +{ + return mmu_has_feature(MMU_FTR_TYPE_RADIX); +} + +static inline bool early_radix_enabled(void) +{ + return early_mmu_has_feature(MMU_FTR_TYPE_RADIX); +} +#else +static inline bool radix_enabled(void) +{ + return false; +} + +static inline bool early_radix_enabled(void) +{ + return false; +} +#endif + #endif /* !__ASSEMBLY__ */ /* The kernel use the constants below to index in the page sizes array.
@@ -210,6 +283,7 @@ extern void early_init_mmu(void); extern void early_init_mmu_secondary(void); extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size); +static inline void mmu_early_init_devtree(void) { } #endif /* __ASSEMBLY__ */ #endif @@ -230,9 +304,5 @@ extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, # include <asm/mmu-8xx.h> #endif -#ifndef radix_enabled -#define radix_enabled() (0) -#endif - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MMU_H_ */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 40f3615bf940..f69f40f1519a 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1256,15 +1256,6 @@ static inline void msr_check_and_clear(unsigned long bits) __msr_check_and_clear(bits); } -static inline unsigned long mfvtb (void) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - if (cpu_has_feature(CPU_FTR_ARCH_207S)) - return mfspr(SPRN_VTB); -#endif - return 0; -} - #ifdef __powerpc64__ #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) #define mftb() ({unsigned long rval; \ diff --git a/arch/powerpc/include/asm/rtc.h b/arch/powerpc/include/asm/rtc.h deleted file mode 100644 index f5802926b6c0..000000000000 --- a/arch/powerpc/include/asm/rtc.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Real-time clock definitions and interfaces - * - * Author: Tom Rini <trini@mvista.com> - * - * 2002 (c) MontaVista, Software, Inc. This file is licensed under - * the terms of the GNU General Public License version 2. This program - * is licensed "as is" without any warranty of any kind, whether express - * or implied. - * - * Based on: - * include/asm-m68k/rtc.h - * - * Copyright Richard Zidlicky - * implementation details for genrtc/q40rtc driver - * - * And the old drivers/macintosh/rtc.c which was heavily based on: - * Linux/SPARC Real Time Clock Driver - * Copyright (C) 1996 Thomas K. 
Dyas (tdyas@eden.rutgers.edu) - * - * With additional work by Paul Mackerras and Franz Sirl. - */ - -#ifndef __ASM_POWERPC_RTC_H__ -#define __ASM_POWERPC_RTC_H__ - -#ifdef __KERNEL__ - -#include <linux/rtc.h> - -#include <asm/machdep.h> -#include <asm/time.h> - -#define RTC_PIE 0x40 /* periodic interrupt enable */ -#define RTC_AIE 0x20 /* alarm interrupt enable */ -#define RTC_UIE 0x10 /* update-finished interrupt enable */ - -/* some dummy definitions */ -#define RTC_BATT_BAD 0x100 /* battery bad */ -#define RTC_SQWE 0x08 /* enable square-wave output */ -#define RTC_DM_BINARY 0x04 /* all time/date values are BCD if clear */ -#define RTC_24H 0x02 /* 24 hour mode - else hours bit 7 means pm */ -#define RTC_DST_EN 0x01 /* auto switch DST - works f. USA only */ - -static inline unsigned int get_rtc_time(struct rtc_time *time) -{ - if (ppc_md.get_rtc_time) - ppc_md.get_rtc_time(time); - return RTC_24H; -} - -/* Set the current date and time in the real time clock. */ -static inline int set_rtc_time(struct rtc_time *time) -{ - if (ppc_md.set_rtc_time) - return ppc_md.set_rtc_time(time); - return -EINVAL; -} - -static inline unsigned int get_rtc_ss(void) -{ - struct rtc_time h; - - get_rtc_time(&h); - return h.tm_sec; -} - -static inline int get_rtc_pll(struct rtc_pll_info *pll) -{ - return -EINVAL; -} -static inline int set_rtc_pll(struct rtc_pll_info *pll) -{ - return -EINVAL; -} - -#endif /* __KERNEL__ */ -#endif /* __ASM_POWERPC_RTC_H__ */ diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 17c8380673a6..0a74ebe934e1 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -75,6 +75,14 @@ static inline void disable_kernel_spe(void) static inline void __giveup_spe(struct task_struct *t) { } #endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +extern void flush_tmregs_to_thread(struct task_struct *); +#else +static inline void flush_tmregs_to_thread(struct task_struct *t) +{ +} +#endif + static 
inline void clear_task_ebb(struct task_struct *t) { #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 09211640a0e0..b240666b7bc1 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -18,6 +18,7 @@ #include <linux/percpu.h> #include <asm/processor.h> +#include <asm/cpu_has_feature.h> /* time.c */ extern unsigned long tb_ticks_per_jiffy; @@ -103,7 +104,7 @@ static inline u64 get_vtb(void) { #ifdef CONFIG_PPC_BOOK3S_64 if (cpu_has_feature(CPU_FTR_ARCH_207S)) - return mfvtb(); + return mfspr(SPRN_VTB); #endif return 0; } diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 20733fa518ae..f6f68f73e858 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -46,5 +46,18 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, #endif } +#ifdef CONFIG_SMP +static inline int mm_is_core_local(struct mm_struct *mm) +{ + return cpumask_subset(mm_cpumask(mm), + topology_sibling_cpumask(smp_processor_id())); +} +#else +static inline int mm_is_core_local(struct mm_struct *mm) +{ + return 1; +} +#endif + #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_TLB_H */ diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index 1b38eea28e5a..13dbcd41885e 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -54,7 +54,6 @@ extern void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, #define flush_tlb_page(vma,addr) local_flush_tlb_page(vma,addr) #define __flush_tlb_page(mm,addr,p,i) __local_flush_tlb_page(mm,addr,p,i) #endif -#define flush_tlb_page_nohash(vma,addr) flush_tlb_page(vma,addr) #elif defined(CONFIG_PPC_STD_MMU_32) diff --git a/arch/powerpc/include/asm/xor.h b/arch/powerpc/include/asm/xor.h index 0abb97f3be10..a36c2069d8ed 100644 --- a/arch/powerpc/include/asm/xor.h +++ b/arch/powerpc/include/asm/xor.h @@ -23,6 +23,7 @@ 
#ifdef CONFIG_ALTIVEC #include <asm/cputable.h> +#include <asm/cpu_has_feature.h> void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, unsigned long *v2_in); diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h index c2d21d11c2d2..3a9e44c45c78 100644 --- a/arch/powerpc/include/uapi/asm/elf.h +++ b/arch/powerpc/include/uapi/asm/elf.h @@ -91,6 +91,11 @@ #define ELF_NGREG 48 /* includes nip, msr, lr, etc. */ #define ELF_NFPREG 33 /* includes fpscr */ +#define ELF_NVMX 34 /* includes all vector registers */ +#define ELF_NVSX 32 /* includes all VSX registers */ +#define ELF_NTMSPRREG 3 /* include tfhar, tfiar, texasr */ +#define ELF_NEBB 3 /* includes ebbrr, ebbhr, bescr */ +#define ELF_NPMU 5 /* includes siar, sdar, sier, mmcr2, mmcr0 */ typedef unsigned long elf_greg_t64; typedef elf_greg_t64 elf_gregset_t64[ELF_NGREG]; diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index c7097f933114..033f3385fa49 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -26,6 +26,7 @@ #include <asm/emulated_ops.h> #include <asm/switch_to.h> #include <asm/disassemble.h> +#include <asm/cpu_has_feature.h> struct aligninfo { unsigned char len; diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index d81f826d1029..74248ab18e98 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -15,6 +15,7 @@ #include <linux/threads.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/jump_label.h> #include <asm/oprofile_impl.h> #include <asm/cputable.h> @@ -2224,3 +2225,39 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr) return NULL; } + +#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS +struct static_key_true cpu_feature_keys[NUM_CPU_FTR_KEYS] = { + [0 ... 
NUM_CPU_FTR_KEYS - 1] = STATIC_KEY_TRUE_INIT +}; +EXPORT_SYMBOL_GPL(cpu_feature_keys); + +void __init cpu_feature_keys_init(void) +{ + int i; + + for (i = 0; i < NUM_CPU_FTR_KEYS; i++) { + unsigned long f = 1ul << i; + + if (!(cur_cpu_spec->cpu_features & f)) + static_branch_disable(&cpu_feature_keys[i]); + } +} + +struct static_key_true mmu_feature_keys[NUM_MMU_FTR_KEYS] = { + [0 ... NUM_MMU_FTR_KEYS - 1] = STATIC_KEY_TRUE_INIT +}; +EXPORT_SYMBOL_GPL(mmu_feature_keys); + +void __init mmu_feature_keys_init(void) +{ + int i; + + for (i = 0; i < NUM_MMU_FTR_KEYS; i++) { + unsigned long f = 1ul << i; + + if (!(cur_cpu_spec->mmu_features & f)) + static_branch_disable(&mmu_feature_keys[i]); + } +} +#endif diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index fcb2887f5a33..6b8bc0dd09d4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -532,7 +532,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) #ifdef CONFIG_PPC_STD_MMU_64 BEGIN_MMU_FTR_SECTION b 2f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX) +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) BEGIN_FTR_SECTION clrrdi r6,r8,28 /* get its ESID */ clrrdi r9,r1,28 /* get current sp ESID */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 694def6c9d61..41091fdf9bd8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -940,7 +940,7 @@ BEGIN_MMU_FTR_SECTION b do_hash_page /* Try to handle as hpte fault */ MMU_FTR_SECTION_ELSE b handle_page_fault -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) .align 7 .globl h_data_storage_common @@ -971,7 +971,7 @@ BEGIN_MMU_FTR_SECTION b do_hash_page /* Try to handle as hpte fault */ MMU_FTR_SECTION_ELSE b handle_page_fault -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) @@ -1392,7 +1392,7 @@ 
slb_miss_realmode: #ifdef CONFIG_PPC_STD_MMU_64 BEGIN_MMU_FTR_SECTION bl slb_allocate_realmode -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX) +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) #endif /* All done -- return from exception. */ @@ -1406,7 +1406,7 @@ BEGIN_MMU_FTR_SECTION beq- 2f FTR_SECTION_ELSE b 2f -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) .machine push .machine "power4" diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 8a56a51fc0cb..ba79d15f4ddd 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -572,7 +572,7 @@ common_exit: BEGIN_MMU_FTR_SECTION b no_segments -END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX) +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) /* Restore SLB from PACA */ ld r8,PACA_SLBSHADOWPTR(r13) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index ac910d9982df..08887cf2b20e 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -75,6 +75,7 @@ #endif #define CREATE_TRACE_POINTS #include <asm/trace.h> +#include <asm/cpu_has_feature.h> DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 93dae296b6be..fa20060ff7a5 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -184,7 +184,7 @@ void setup_paca(struct paca_struct *new_paca) * if we do a GET_PACA() before the feature fixups have been * applied */ - if (cpu_has_feature(CPU_FTR_HVMODE)) + if (early_cpu_has_feature(CPU_FTR_HVMODE)) mtspr(SPRN_SPRG_HPACA, local_paca); #endif mtspr(SPRN_SPRG_PACA, local_paca); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a8cca88e972f..58ccf86415b4 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -58,6 +58,7 @@ #include <asm/code-patching.h> #include <asm/exec.h> #include <asm/livepatch.h> +#include 
<asm/cpu_has_feature.h> #include <linux/kprobes.h> #include <linux/kdebug.h> @@ -1073,6 +1074,26 @@ static inline void restore_sprs(struct thread_struct *old_thread, #endif } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +void flush_tmregs_to_thread(struct task_struct *tsk) +{ + /* + * Process self tracing is not yet supported through + * ptrace interface. Ptrace generic code should have + * prevented this from happening in the first place. + * Warn once here with the message, if some how it + * is attempted. + */ + WARN_ONCE(tsk == current, + "Not expecting ptrace on self: TM regs may be incorrect\n"); + + /* + * If task is not current, it should have been flushed + * already to it's thread_struct during __switch_to(). + */ +} +#endif + struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *new) { diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index bae3db791150..b0245bed6f54 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -170,7 +170,7 @@ static struct ibm_pa_feature { */ {CPU_FTR_TM_COMP, 0, 0, PPC_FEATURE2_HTM_COMP|PPC_FEATURE2_HTM_NOSC_COMP, 22, 0, 0}, - {0, MMU_FTR_RADIX, 0, 0, 40, 0, 0}, + {0, MMU_FTR_TYPE_RADIX, 0, 0, 40, 0, 0}, }; static void __init scan_features(unsigned long node, const unsigned char *ftrs, @@ -647,14 +647,6 @@ static void __init early_reserve_mem(void) #endif } -static bool disable_radix; -static int __init parse_disable_radix(char *p) -{ - disable_radix = true; - return 0; -} -early_param("disable_radix", parse_disable_radix); - void __init early_init_devtree(void *params) { phys_addr_t limit; @@ -744,11 +736,8 @@ void __init early_init_devtree(void *params) */ spinning_secondaries = boot_cpu_count - 1; #endif - /* - * now fixup radix MMU mode based on kernel command line - */ - if (disable_radix) - cur_cpu_spec->mmu_features &= ~MMU_FTR_RADIX; + + mmu_early_init_devtree(); #ifdef CONFIG_PPC_POWERNV /* Scan and build the list of machine check recoverable ranges */ diff 
--git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 134bee9ac664..4f3c5756cc09 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -64,6 +64,10 @@ struct pt_regs_offset { {.name = STR(gpr##num), .offset = offsetof(struct pt_regs, gpr[num])} #define REG_OFFSET_END {.name = NULL, .offset = 0} +#define TVSO(f) (offsetof(struct thread_vr_state, f)) +#define TFSO(f) (offsetof(struct thread_fp_state, f)) +#define TSO(f) (offsetof(struct thread_struct, f)) + static const struct pt_regs_offset regoffset_table[] = { GPR_OFFSET_NAME(0), GPR_OFFSET_NAME(1), @@ -181,6 +185,26 @@ static int set_user_msr(struct task_struct *task, unsigned long msr) return 0; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static unsigned long get_user_ckpt_msr(struct task_struct *task) +{ + return task->thread.ckpt_regs.msr | task->thread.fpexc_mode; +} + +static int set_user_ckpt_msr(struct task_struct *task, unsigned long msr) +{ + task->thread.ckpt_regs.msr &= ~MSR_DEBUGCHANGE; + task->thread.ckpt_regs.msr |= msr & MSR_DEBUGCHANGE; + return 0; +} + +static int set_user_ckpt_trap(struct task_struct *task, unsigned long trap) +{ + task->thread.ckpt_regs.trap = trap & 0xfff0; + return 0; +} +#endif + #ifdef CONFIG_PPC64 static int get_user_dscr(struct task_struct *task, unsigned long *data) { @@ -358,6 +382,29 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset, return ret; } +/* + * When the transaction is active, 'transact_fp' holds the current running + * value of all FPR registers and 'fp_state' holds the last checkpointed + * value of all FPR registers for the current transaction. When transaction + * is not active 'fp_state' holds the current running state of all the FPR + * registers. So this function which returns the current running values of + * all the FPR registers, needs to know whether any transaction is active + * or not. 
+ * + * Userspace interface buffer layout: + * + * struct data { + * u64 fpr[32]; + * u64 fpscr; + * }; + * + * There are two config options CONFIG_VSX and CONFIG_PPC_TRANSACTIONAL_MEM + * which determines the final code in this function. All the combinations of + * these two config options are possible except the one below as transactional + * memory config pulls in CONFIG_VSX automatically. + * + * !defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM) + */ static int fpr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) @@ -368,14 +415,31 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset, #endif flush_fp_to_thread(target); -#ifdef CONFIG_VSX +#if defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM) + /* copy to local buffer then write that out */ + if (MSR_TM_ACTIVE(target->thread.regs->msr)) { + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.TS_TRANS_FPR(i); + buf[32] = target->thread.transact_fp.fpscr; + } else { + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.TS_FPR(i); + buf[32] = target->thread.fp_state.fpscr; + } + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1); +#endif + +#if defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM) /* copy to local buffer then write that out */ for (i = 0; i < 32 ; i++) buf[i] = target->thread.TS_FPR(i); buf[32] = target->thread.fp_state.fpscr; return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1); +#endif -#else +#if !defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM) BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != offsetof(struct thread_fp_state, fpr[32])); @@ -384,6 +448,29 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset, #endif } +/* + * When the transaction is active, 'transact_fp' holds the current running + * value 
of all FPR registers and 'fp_state' holds the last checkpointed + * value of all FPR registers for the current transaction. When transaction + * is not active 'fp_state' holds the current running state of all the FPR + * registers. So this function which setss the current running values of + * all the FPR registers, needs to know whether any transaction is active + * or not. + * + * Userspace interface buffer layout: + * + * struct data { + * u64 fpr[32]; + * u64 fpscr; + * }; + * + * There are two config options CONFIG_VSX and CONFIG_PPC_TRANSACTIONAL_MEM + * which determines the final code in this function. All the combinations of + * these two config options are possible except the one below as transactional + * memory config pulls in CONFIG_VSX automatically. + * + * !defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM) + */ static int fpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) @@ -394,7 +481,27 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, #endif flush_fp_to_thread(target); -#ifdef CONFIG_VSX +#if defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM) + /* copy to local buffer then write that out */ + i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1); + if (i) + return i; + + if (MSR_TM_ACTIVE(target->thread.regs->msr)) { + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + for (i = 0; i < 32 ; i++) + target->thread.TS_TRANS_FPR(i) = buf[i]; + target->thread.transact_fp.fpscr = buf[32]; + } else { + for (i = 0; i < 32 ; i++) + target->thread.TS_FPR(i) = buf[i]; + target->thread.fp_state.fpscr = buf[32]; + } + return 0; +#endif + +#if defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM) /* copy to local buffer then write that out */ i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1); if (i) @@ -403,7 +510,9 @@ static int fpr_set(struct task_struct 
*target, const struct user_regset *regset, target->thread.TS_FPR(i) = buf[i]; target->thread.fp_state.fpscr = buf[32]; return 0; -#else +#endif + +#if !defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM) BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != offsetof(struct thread_fp_state, fpr[32])); @@ -433,10 +542,28 @@ static int vr_active(struct task_struct *target, return target->thread.used_vr ? regset->n : 0; } +/* + * When the transaction is active, 'transact_vr' holds the current running + * value of all the VMX registers and 'vr_state' holds the last checkpointed + * value of all the VMX registers for the current transaction to fall back + * on in case it aborts. When transaction is not active 'vr_state' holds + * the current running state of all the VMX registers. So this function which + * gets the current running values of all the VMX registers, needs to know + * whether any transaction is active or not. + * + * Userspace interface buffer layout: + * + * struct data { + * vector128 vr[32]; + * vector128 vscr; + * vector128 vrsave; + * }; + */ static int vr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { + struct thread_vr_state *addr; int ret; flush_altivec_to_thread(target); @@ -444,8 +571,19 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset, BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) != offsetof(struct thread_vr_state, vr[32])); +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(target->thread.regs->msr)) { + flush_fp_to_thread(target); + flush_tmregs_to_thread(target); + addr = &target->thread.transact_vr; + } else { + addr = &target->thread.vr_state; + } +#else + addr = &target->thread.vr_state; +#endif ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.vr_state, 0, + addr, 0, 33 * sizeof(vector128)); if (!ret) { /* @@ -456,7 +594,16 @@ static int vr_get(struct task_struct *target, 
const struct user_regset *regset, u32 word; } vrsave; memset(&vrsave, 0, sizeof(vrsave)); + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(target->thread.regs->msr)) + vrsave.word = target->thread.transact_vrsave; + else + vrsave.word = target->thread.vrsave; +#else vrsave.word = target->thread.vrsave; +#endif + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave, 33 * sizeof(vector128), -1); } @@ -464,10 +611,28 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset, return ret; } +/* + * When the transaction is active, 'transact_vr' holds the current running + * value of all the VMX registers and 'vr_state' holds the last checkpointed + * value of all the VMX registers for the current transaction to fall back + * on in case it aborts. When transaction is not active 'vr_state' holds + * the current running state of all the VMX registers. So this function which + * sets the current running values of all the VMX registers, needs to know + * whether any transaction is active or not. 
+ * + * Userspace interface buffer layout: + * + * struct data { + * vector128 vr[32]; + * vector128 vscr; + * vector128 vrsave; + * }; + */ static int vr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + struct thread_vr_state *addr; int ret; flush_altivec_to_thread(target); @@ -475,8 +640,19 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset, BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) != offsetof(struct thread_vr_state, vr[32])); +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(target->thread.regs->msr)) { + flush_fp_to_thread(target); + flush_tmregs_to_thread(target); + addr = &target->thread.transact_vr; + } else { + addr = &target->thread.vr_state; + } +#else + addr = &target->thread.vr_state; +#endif ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.vr_state, 0, + addr, 0, 33 * sizeof(vector128)); if (!ret && count > 0) { /* @@ -487,11 +663,28 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset, u32 word; } vrsave; memset(&vrsave, 0, sizeof(vrsave)); + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(target->thread.regs->msr)) + vrsave.word = target->thread.transact_vrsave; + else + vrsave.word = target->thread.vrsave; +#else vrsave.word = target->thread.vrsave; +#endif ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave, 33 * sizeof(vector128), -1); - if (!ret) + if (!ret) { + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(target->thread.regs->msr)) + target->thread.transact_vrsave = vrsave.word; + else + target->thread.vrsave = vrsave.word; +#else target->thread.vrsave = vrsave.word; +#endif + } } return ret; @@ -512,6 +705,21 @@ static int vsr_active(struct task_struct *target, return target->thread.used_vsr ? 
regset->n : 0; } +/* + * When the transaction is active, 'transact_fp' holds the current running + * value of all FPR registers and 'fp_state' holds the last checkpointed + * value of all FPR registers for the current transaction. When transaction + * is not active 'fp_state' holds the current running state of all the FPR + * registers. So this function which returns the current running values of + * all the FPR registers, needs to know whether any transaction is active + * or not. + * + * Userspace interface buffer layout: + * + * struct data { + * u64 vsx[32]; + * }; + */ static int vsr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) @@ -519,16 +727,47 @@ static int vsr_get(struct task_struct *target, const struct user_regset *regset, u64 buf[32]; int ret, i; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); +#endif flush_vsx_to_thread(target); +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(target->thread.regs->msr)) { + for (i = 0; i < 32 ; i++) + buf[i] = target->thread. + transact_fp.fpr[i][TS_VSRLOWOFFSET]; + } else { + for (i = 0; i < 32 ; i++) + buf[i] = target->thread. + fp_state.fpr[i][TS_VSRLOWOFFSET]; + } +#else for (i = 0; i < 32 ; i++) buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; +#endif ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, 32 * sizeof(double)); return ret; } +/* + * When the transaction is active, 'transact_fp' holds the current running + * value of all FPR registers and 'fp_state' holds the last checkpointed + * value of all FPR registers for the current transaction. When transaction + * is not active 'fp_state' holds the current running state of all the FPR + * registers. So this function which sets the current running values of all + * the FPR registers, needs to know whether any transaction is active or not. 
+ * + * Userspace interface buffer layout: + * + * struct data { + * u64 vsx[32]; + * }; + */ static int vsr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) @@ -536,12 +775,30 @@ static int vsr_set(struct task_struct *target, const struct user_regset *regset, u64 buf[32]; int ret,i; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); +#endif flush_vsx_to_thread(target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, 32 * sizeof(double)); + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(target->thread.regs->msr)) { + for (i = 0; i < 32 ; i++) + target->thread.transact_fp. + fpr[i][TS_VSRLOWOFFSET] = buf[i]; + } else { + for (i = 0; i < 32 ; i++) + target->thread.fp_state. + fpr[i][TS_VSRLOWOFFSET] = buf[i]; + } +#else for (i = 0; i < 32 ; i++) target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; +#endif return ret; @@ -614,8 +871,1030 @@ static int evr_set(struct task_struct *target, const struct user_regset *regset, } #endif /* CONFIG_SPE */ +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/** + * tm_cgpr_active - get active number of registers in CGPR + * @target: The target task. + * @regset: The user regset structure. + * + * This function checks for the active number of available + * regisers in transaction checkpointed GPR category. + */ +static int tm_cgpr_active(struct task_struct *target, + const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return 0; + + return regset->n; +} + +/** + * tm_cgpr_get - get CGPR registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy from. + * @ubuf: User buffer to copy into. 
+ * + * This function gets transaction checkpointed GPR registers. + * + * When the transaction is active, 'ckpt_regs' holds all the checkpointed + * GPR register values for the current transaction to fall back on if it + * aborts in between. This function gets those checkpointed GPR registers. + * The userspace interface buffer layout is as follows. + * + * struct data { + * struct pt_regs ckpt_regs; + * }; + */ +static int tm_cgpr_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.ckpt_regs, + 0, offsetof(struct pt_regs, msr)); + if (!ret) { + unsigned long msr = get_user_ckpt_msr(target); + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &msr, + offsetof(struct pt_regs, msr), + offsetof(struct pt_regs, msr) + + sizeof(msr)); + } + + BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != + offsetof(struct pt_regs, msr) + sizeof(long)); + + if (!ret) + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.ckpt_regs.orig_gpr3, + offsetof(struct pt_regs, orig_gpr3), + sizeof(struct pt_regs)); + if (!ret) + ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, + sizeof(struct pt_regs), -1); + + return ret; +} /* + * tm_cgpr_set - set the CGPR registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy into. + * @ubuf: User buffer to copy from. + * + * This function sets in transaction checkpointed GPR registers. 
+ * + * When the transaction is active, 'ckpt_regs' holds the checkpointed + * GPR register values for the current transaction to fall back on if it + * aborts in between. This function sets those checkpointed GPR registers. + * The userspace interface buffer layout is as follows. + * + * struct data { + * struct pt_regs ckpt_regs; + * }; + */ +static int tm_cgpr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + unsigned long reg; + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.ckpt_regs, + 0, PT_MSR * sizeof(reg)); + + if (!ret && count > 0) { + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, ®, + PT_MSR * sizeof(reg), + (PT_MSR + 1) * sizeof(reg)); + if (!ret) + ret = set_user_ckpt_msr(target, reg); + } + + BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != + offsetof(struct pt_regs, msr) + sizeof(long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.ckpt_regs.orig_gpr3, + PT_ORIG_R3 * sizeof(reg), + (PT_MAX_PUT_REG + 1) * sizeof(reg)); + + if (PT_MAX_PUT_REG + 1 < PT_TRAP && !ret) + ret = user_regset_copyin_ignore( + &pos, &count, &kbuf, &ubuf, + (PT_MAX_PUT_REG + 1) * sizeof(reg), + PT_TRAP * sizeof(reg)); + + if (!ret && count > 0) { + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, ®, + PT_TRAP * sizeof(reg), + (PT_TRAP + 1) * sizeof(reg)); + if (!ret) + ret = set_user_ckpt_trap(target, reg); + } + + if (!ret) + ret = user_regset_copyin_ignore( + &pos, &count, &kbuf, &ubuf, + (PT_TRAP + 1) * sizeof(reg), -1); + + return ret; +} + +/** + * tm_cfpr_active - get active number of registers in CFPR + * @target: The target task. 
+ * @regset: The user regset structure. + * + * This function checks for the active number of available + * registers in transaction checkpointed FPR category. + */ +static int tm_cfpr_active(struct task_struct *target, + const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return 0; + + return regset->n; +} + +/** + * tm_cfpr_get - get CFPR registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy from. + * @ubuf: User buffer to copy into. + * + * This function gets in transaction checkpointed FPR registers. + * + * When the transaction is active 'fp_state' holds the checkpointed + * values for the current transaction to fall back on if it aborts + * in between. This function gets those checkpointed FPR registers. + * The userspace interface buffer layout is as follows. + * + * struct data { + * u64 fpr[32]; + * u64 fpscr; + *}; + */ +static int tm_cfpr_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + u64 buf[33]; + int i; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + + /* copy to local buffer then write that out */ + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.TS_FPR(i); + buf[32] = target->thread.fp_state.fpscr; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1); +} + +/** + * tm_cfpr_set - set CFPR registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy into. + * @ubuf: User buffer to copy from. 
+ * + * This function sets in transaction checkpointed FPR registers. + * + * When the transaction is active 'fp_state' holds the checkpointed + * FPR register values for the current transaction to fall back on + * if it aborts in between. This function sets these checkpointed + * FPR registers. The userspace interface buffer layout is as follows. + * + * struct data { + * u64 fpr[32]; + * u64 fpscr; + *}; + */ +static int tm_cfpr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + u64 buf[33]; + int i; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + + /* copy to local buffer then write that out */ + i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1); + if (i) + return i; + for (i = 0; i < 32 ; i++) + target->thread.TS_FPR(i) = buf[i]; + target->thread.fp_state.fpscr = buf[32]; + return 0; +} + +/** + * tm_cvmx_active - get active number of registers in CVMX + * @target: The target task. + * @regset: The user regset structure. + * + * This function checks for the active number of available + * registers in checkpointed VMX category. + */ +static int tm_cvmx_active(struct task_struct *target, + const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return 0; + + return regset->n; +} + +/** + * tm_cvmx_get - get CVMX registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy from. + * @ubuf: User buffer to copy into. + * + * This function gets in transaction checkpointed VMX registers. 
+ * + * When the transaction is active 'vr_state' and 'vr_save' hold + * the checkpointed values for the current transaction to fall + * back on if it aborts in between. The userspace interface buffer + * layout is as follows. + * + * struct data { + * vector128 vr[32]; + * vector128 vscr; + * vector128 vrsave; + *}; + */ +static int tm_cvmx_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + BUILD_BUG_ON(TVSO(vscr) != TVSO(vr[32])); + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + /* Flush the state */ + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.vr_state, 0, + 33 * sizeof(vector128)); + if (!ret) { + /* + * Copy out only the low-order word of vrsave. + */ + union { + elf_vrreg_t reg; + u32 word; + } vrsave; + memset(&vrsave, 0, sizeof(vrsave)); + vrsave.word = target->thread.vrsave; + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave, + 33 * sizeof(vector128), -1); + } + + return ret; +} + +/** + * tm_cvmx_set - set CVMX registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy into. + * @ubuf: User buffer to copy from. + * + * This function sets in transaction checkpointed VMX registers. + * + * When the transaction is active 'vr_state' and 'vr_save' hold + * the checkpointed values for the current transaction to fall + * back on if it aborts in between. The userspace interface buffer + * layout is as follows. 
+ * + * struct data { + * vector128 vr[32]; + * vector128 vscr; + * vector128 vrsave; + *}; + */ +static int tm_cvmx_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + BUILD_BUG_ON(TVSO(vscr) != TVSO(vr[32])); + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.vr_state, 0, + 33 * sizeof(vector128)); + if (!ret && count > 0) { + /* + * We use only the low-order word of vrsave. + */ + union { + elf_vrreg_t reg; + u32 word; + } vrsave; + memset(&vrsave, 0, sizeof(vrsave)); + vrsave.word = target->thread.vrsave; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave, + 33 * sizeof(vector128), -1); + if (!ret) + target->thread.vrsave = vrsave.word; + } + + return ret; +} + +/** + * tm_cvsx_active - get active number of registers in CVSX + * @target: The target task. + * @regset: The user regset structure. + * + * This function checks for the active number of available + * registers in transaction checkpointed VSX category. + */ +static int tm_cvsx_active(struct task_struct *target, + const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return 0; + + flush_vsx_to_thread(target); + return target->thread.used_vsr ? regset->n : 0; +} + +/** + * tm_cvsx_get - get CVSX registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy from. + * @ubuf: User buffer to copy into. + * + * This function gets in transaction checkpointed VSX registers. 
+ * + * When the transaction is active 'fp_state' holds the checkpointed + * values for the current transaction to fall back on if it aborts + * in between. This function gets those checkpointed VSX registers. + * The userspace interface buffer layout is as follows. + * + * struct data { + * u64 vsx[32]; + *}; + */ +static int tm_cvsx_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + u64 buf[32]; + int ret, i; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + /* Flush the state */ + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + flush_vsx_to_thread(target); + + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + buf, 0, 32 * sizeof(double)); + + return ret; +} + +/** + * tm_cvsx_set - set CVSX registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy into. + * @ubuf: User buffer to copy from. + * + * This function sets in transaction checkpointed VSX registers. + * + * When the transaction is active 'fp_state' holds the checkpointed + * VSX register values for the current transaction to fall back on + * if it aborts in between. This function sets these checkpointed + * VSX registers. The userspace interface buffer layout is as follows. 
+ * + * struct data { + * u64 vsx[32]; + *}; + */ +static int tm_cvsx_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + u64 buf[32]; + int ret, i; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + /* Flush the state */ + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + flush_vsx_to_thread(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + buf, 0, 32 * sizeof(double)); + for (i = 0; i < 32 ; i++) + target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + + return ret; +} + +/** + * tm_spr_active - get active number of registers in TM SPR + * @target: The target task. + * @regset: The user regset structure. + * + * This function checks the active number of available + * registers in the transactional memory SPR category. + */ +static int tm_spr_active(struct task_struct *target, + const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + return regset->n; +} + +/** + * tm_spr_get - get the TM related SPR registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy from. + * @ubuf: User buffer to copy into. + * + * This function gets transactional memory related SPR registers. + * The userspace interface buffer layout is as follows. 
+ * + * struct { + * u64 tm_tfhar; + * u64 tm_texasr; + * u64 tm_tfiar; + * }; + */ +static int tm_spr_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + /* Build tests */ + BUILD_BUG_ON(TSO(tm_tfhar) + sizeof(u64) != TSO(tm_texasr)); + BUILD_BUG_ON(TSO(tm_texasr) + sizeof(u64) != TSO(tm_tfiar)); + BUILD_BUG_ON(TSO(tm_tfiar) + sizeof(u64) != TSO(ckpt_regs)); + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + /* Flush the states */ + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + + /* TFHAR register */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_tfhar, 0, sizeof(u64)); + + /* TEXASR register */ + if (!ret) + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_texasr, sizeof(u64), + 2 * sizeof(u64)); + + /* TFIAR register */ + if (!ret) + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_tfiar, + 2 * sizeof(u64), 3 * sizeof(u64)); + return ret; +} + +/** + * tm_spr_set - set the TM related SPR registers + * @target: The target task. + * @regset: The user regset structure. + * @pos: The buffer position. + * @count: Number of bytes to copy. + * @kbuf: Kernel buffer to copy into. + * @ubuf: User buffer to copy from. + * + * This function sets transactional memory related SPR registers. + * The userspace interface buffer layout is as follows. 
+ * + * struct { + * u64 tm_tfhar; + * u64 tm_texasr; + * u64 tm_tfiar; + * }; + */ +static int tm_spr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + /* Build tests */ + BUILD_BUG_ON(TSO(tm_tfhar) + sizeof(u64) != TSO(tm_texasr)); + BUILD_BUG_ON(TSO(tm_texasr) + sizeof(u64) != TSO(tm_tfiar)); + BUILD_BUG_ON(TSO(tm_tfiar) + sizeof(u64) != TSO(ckpt_regs)); + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + /* Flush the states */ + flush_fp_to_thread(target); + flush_altivec_to_thread(target); + flush_tmregs_to_thread(target); + + /* TFHAR register */ + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_tfhar, 0, sizeof(u64)); + + /* TEXASR register */ + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_texasr, sizeof(u64), + 2 * sizeof(u64)); + + /* TFIAR register */ + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_tfiar, + 2 * sizeof(u64), 3 * sizeof(u64)); + return ret; +} + +static int tm_tar_active(struct task_struct *target, + const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (MSR_TM_ACTIVE(target->thread.regs->msr)) + return regset->n; + + return 0; +} + +static int tm_tar_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_tar, 0, sizeof(u64)); + return ret; +} + +static int tm_tar_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + 
return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_tar, 0, sizeof(u64)); + return ret; +} + +static int tm_ppr_active(struct task_struct *target, + const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (MSR_TM_ACTIVE(target->thread.regs->msr)) + return regset->n; + + return 0; +} + + +static int tm_ppr_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_ppr, 0, sizeof(u64)); + return ret; +} + +static int tm_ppr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_ppr, 0, sizeof(u64)); + return ret; +} + +static int tm_dscr_active(struct task_struct *target, + const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (MSR_TM_ACTIVE(target->thread.regs->msr)) + return regset->n; + + return 0; +} + +static int tm_dscr_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_dscr, 0, sizeof(u64)); + return ret; +} + +static int tm_dscr_set(struct task_struct *target, 
+ const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_TM)) + return -ENODEV; + + if (!MSR_TM_ACTIVE(target->thread.regs->msr)) + return -ENODATA; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tm_dscr, 0, sizeof(u64)); + return ret; +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + +#ifdef CONFIG_PPC64 +static int ppr_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.ppr, 0, sizeof(u64)); + return ret; +} + +static int ppr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.ppr, 0, sizeof(u64)); + return ret; +} + +static int dscr_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.dscr, 0, sizeof(u64)); + return ret; +} +static int dscr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.dscr, 0, sizeof(u64)); + return ret; +} +#endif +#ifdef CONFIG_PPC_BOOK3S_64 +static int tar_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.tar, 0, sizeof(u64)); + return ret; +} +static int tar_set(struct task_struct *target, + const 
struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tar, 0, sizeof(u64)); + return ret; +} + +static int ebb_active(struct task_struct *target, + const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + if (target->thread.used_ebb) + return regset->n; + + return 0; +} + +static int ebb_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + /* Build tests */ + BUILD_BUG_ON(TSO(ebbrr) + sizeof(unsigned long) != TSO(ebbhr)); + BUILD_BUG_ON(TSO(ebbhr) + sizeof(unsigned long) != TSO(bescr)); + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + if (!target->thread.used_ebb) + return -ENODATA; + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.ebbrr, 0, 3 * sizeof(unsigned long)); +} + +static int ebb_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + + /* Build tests */ + BUILD_BUG_ON(TSO(ebbrr) + sizeof(unsigned long) != TSO(ebbhr)); + BUILD_BUG_ON(TSO(ebbhr) + sizeof(unsigned long) != TSO(bescr)); + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + if (target->thread.used_ebb) + return -ENODATA; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.ebbrr, 0, sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.ebbhr, sizeof(unsigned long), + 2 * sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.bescr, + 2 * sizeof(unsigned long), 3 * sizeof(unsigned long)); + + return ret; +} +static int pmu_active(struct task_struct *target, + const struct user_regset *regset) +{ + if 
(!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + return regset->n; +} + +static int pmu_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + /* Build tests */ + BUILD_BUG_ON(TSO(siar) + sizeof(unsigned long) != TSO(sdar)); + BUILD_BUG_ON(TSO(sdar) + sizeof(unsigned long) != TSO(sier)); + BUILD_BUG_ON(TSO(sier) + sizeof(unsigned long) != TSO(mmcr2)); + BUILD_BUG_ON(TSO(mmcr2) + sizeof(unsigned long) != TSO(mmcr0)); + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.siar, 0, + 5 * sizeof(unsigned long)); +} + +static int pmu_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + + /* Build tests */ + BUILD_BUG_ON(TSO(siar) + sizeof(unsigned long) != TSO(sdar)); + BUILD_BUG_ON(TSO(sdar) + sizeof(unsigned long) != TSO(sier)); + BUILD_BUG_ON(TSO(sier) + sizeof(unsigned long) != TSO(mmcr2)); + BUILD_BUG_ON(TSO(mmcr2) + sizeof(unsigned long) != TSO(mmcr0)); + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return -ENODEV; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.siar, 0, + sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.sdar, sizeof(unsigned long), + 2 * sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.sier, 2 * sizeof(unsigned long), + 3 * sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.mmcr2, 3 * sizeof(unsigned long), + 4 * sizeof(unsigned long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.mmcr0, 4 * sizeof(unsigned long), + 5 * sizeof(unsigned long)); + return ret; +} +#endif +/* * These are our native 
regset flavors. */ enum powerpc_regset { @@ -630,6 +1909,25 @@ enum powerpc_regset { #ifdef CONFIG_SPE REGSET_SPE, #endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + REGSET_TM_CGPR, /* TM checkpointed GPR registers */ + REGSET_TM_CFPR, /* TM checkpointed FPR registers */ + REGSET_TM_CVMX, /* TM checkpointed VMX registers */ + REGSET_TM_CVSX, /* TM checkpointed VSX registers */ + REGSET_TM_SPR, /* TM specific SPR registers */ + REGSET_TM_CTAR, /* TM checkpointed TAR register */ + REGSET_TM_CPPR, /* TM checkpointed PPR register */ + REGSET_TM_CDSCR, /* TM checkpointed DSCR register */ +#endif +#ifdef CONFIG_PPC64 + REGSET_PPR, /* PPR register */ + REGSET_DSCR, /* DSCR register */ +#endif +#ifdef CONFIG_PPC_BOOK3S_64 + REGSET_TAR, /* TAR register */ + REGSET_EBB, /* EBB registers */ + REGSET_PMR, /* Performance Monitor Registers */ +#endif }; static const struct user_regset native_regsets[] = { @@ -664,6 +1962,77 @@ static const struct user_regset native_regsets[] = { .active = evr_active, .get = evr_get, .set = evr_set }, #endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + [REGSET_TM_CGPR] = { + .core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG, + .size = sizeof(long), .align = sizeof(long), + .active = tm_cgpr_active, .get = tm_cgpr_get, .set = tm_cgpr_set + }, + [REGSET_TM_CFPR] = { + .core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG, + .size = sizeof(double), .align = sizeof(double), + .active = tm_cfpr_active, .get = tm_cfpr_get, .set = tm_cfpr_set + }, + [REGSET_TM_CVMX] = { + .core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX, + .size = sizeof(vector128), .align = sizeof(vector128), + .active = tm_cvmx_active, .get = tm_cvmx_get, .set = tm_cvmx_set + }, + [REGSET_TM_CVSX] = { + .core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX, + .size = sizeof(double), .align = sizeof(double), + .active = tm_cvsx_active, .get = tm_cvsx_get, .set = tm_cvsx_set + }, + [REGSET_TM_SPR] = { + .core_note_type = NT_PPC_TM_SPR, .n = ELF_NTMSPRREG, + .size = sizeof(u64), .align = sizeof(u64), + .active 
= tm_spr_active, .get = tm_spr_get, .set = tm_spr_set + }, + [REGSET_TM_CTAR] = { + .core_note_type = NT_PPC_TM_CTAR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_tar_active, .get = tm_tar_get, .set = tm_tar_set + }, + [REGSET_TM_CPPR] = { + .core_note_type = NT_PPC_TM_CPPR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_ppr_active, .get = tm_ppr_get, .set = tm_ppr_set + }, + [REGSET_TM_CDSCR] = { + .core_note_type = NT_PPC_TM_CDSCR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_dscr_active, .get = tm_dscr_get, .set = tm_dscr_set + }, +#endif +#ifdef CONFIG_PPC64 + [REGSET_PPR] = { + .core_note_type = NT_PPC_PPR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .get = ppr_get, .set = ppr_set + }, + [REGSET_DSCR] = { + .core_note_type = NT_PPC_DSCR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .get = dscr_get, .set = dscr_set + }, +#endif +#ifdef CONFIG_PPC_BOOK3S_64 + [REGSET_TAR] = { + .core_note_type = NT_PPC_TAR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .get = tar_get, .set = tar_set + }, + [REGSET_EBB] = { + .core_note_type = NT_PPC_EBB, .n = ELF_NEBB, + .size = sizeof(u64), .align = sizeof(u64), + .active = ebb_active, .get = ebb_get, .set = ebb_set + }, + [REGSET_PMR] = { + .core_note_type = NT_PPC_PMU, .n = ELF_NPMU, + .size = sizeof(u64), .align = sizeof(u64), + .active = pmu_active, .get = pmu_get, .set = pmu_set + }, +#endif }; static const struct user_regset_view user_ppc_native_view = { @@ -674,24 +2043,35 @@ static const struct user_regset_view user_ppc_native_view = { #ifdef CONFIG_PPC64 #include <linux/compat.h> -static int gpr32_get(struct task_struct *target, +static int gpr32_get_common(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + void *kbuf, void __user *ubuf, bool tm_active) { const unsigned long *regs = &target->thread.regs->gpr[0]; + const unsigned long 
*ckpt_regs; compat_ulong_t *k = kbuf; compat_ulong_t __user *u = ubuf; compat_ulong_t reg; int i; - if (target->thread.regs == NULL) - return -EIO; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + ckpt_regs = &target->thread.ckpt_regs.gpr[0]; +#endif + if (tm_active) { + regs = ckpt_regs; + } else { + if (target->thread.regs == NULL) + return -EIO; - if (!FULL_REGS(target->thread.regs)) { - /* We have a partial register set. Fill 14-31 with bogus values */ - for (i = 14; i < 32; i++) - target->thread.regs->gpr[i] = NV_REG_POISON; + if (!FULL_REGS(target->thread.regs)) { + /* + * We have a partial register set. + * Fill 14-31 with bogus values. + */ + for (i = 14; i < 32; i++) + target->thread.regs->gpr[i] = NV_REG_POISON; + } } pos /= sizeof(reg); @@ -731,20 +2111,31 @@ static int gpr32_get(struct task_struct *target, PT_REGS_COUNT * sizeof(reg), -1); } -static int gpr32_set(struct task_struct *target, +static int gpr32_set_common(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + const void *kbuf, const void __user *ubuf, bool tm_active) { unsigned long *regs = &target->thread.regs->gpr[0]; + unsigned long *ckpt_regs; const compat_ulong_t *k = kbuf; const compat_ulong_t __user *u = ubuf; compat_ulong_t reg; - if (target->thread.regs == NULL) - return -EIO; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + ckpt_regs = &target->thread.ckpt_regs.gpr[0]; +#endif - CHECK_FULL_REGS(target->thread.regs); + if (tm_active) { + regs = ckpt_regs; + } else { + regs = &target->thread.regs->gpr[0]; + + if (target->thread.regs == NULL) + return -EIO; + + CHECK_FULL_REGS(target->thread.regs); + } pos /= sizeof(reg); count /= sizeof(reg); @@ -804,6 +2195,40 @@ static int gpr32_set(struct task_struct *target, (PT_TRAP + 1) * sizeof(reg), -1); } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static int tm_cgpr32_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, 
+ void *kbuf, void __user *ubuf) +{ + return gpr32_get_common(target, regset, pos, count, kbuf, ubuf, 1); +} + +static int tm_cgpr32_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return gpr32_set_common(target, regset, pos, count, kbuf, ubuf, 1); +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + +static int gpr32_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + return gpr32_get_common(target, regset, pos, count, kbuf, ubuf, 0); +} + +static int gpr32_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return gpr32_set_common(target, regset, pos, count, kbuf, ubuf, 0); +} + /* * These are the regset flavors matching the CONFIG_PPC32 native set. */ @@ -832,6 +2257,73 @@ static const struct user_regset compat_regsets[] = { .active = evr_active, .get = evr_get, .set = evr_set }, #endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + [REGSET_TM_CGPR] = { + .core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG, + .size = sizeof(long), .align = sizeof(long), + .active = tm_cgpr_active, + .get = tm_cgpr32_get, .set = tm_cgpr32_set + }, + [REGSET_TM_CFPR] = { + .core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG, + .size = sizeof(double), .align = sizeof(double), + .active = tm_cfpr_active, .get = tm_cfpr_get, .set = tm_cfpr_set + }, + [REGSET_TM_CVMX] = { + .core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX, + .size = sizeof(vector128), .align = sizeof(vector128), + .active = tm_cvmx_active, .get = tm_cvmx_get, .set = tm_cvmx_set + }, + [REGSET_TM_CVSX] = { + .core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX, + .size = sizeof(double), .align = sizeof(double), + .active = tm_cvsx_active, .get = tm_cvsx_get, .set = tm_cvsx_set + }, + [REGSET_TM_SPR] = { + .core_note_type = NT_PPC_TM_SPR, 
.n = ELF_NTMSPRREG, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_spr_active, .get = tm_spr_get, .set = tm_spr_set + }, + [REGSET_TM_CTAR] = { + .core_note_type = NT_PPC_TM_CTAR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_tar_active, .get = tm_tar_get, .set = tm_tar_set + }, + [REGSET_TM_CPPR] = { + .core_note_type = NT_PPC_TM_CPPR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_ppr_active, .get = tm_ppr_get, .set = tm_ppr_set + }, + [REGSET_TM_CDSCR] = { + .core_note_type = NT_PPC_TM_CDSCR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .active = tm_dscr_active, .get = tm_dscr_get, .set = tm_dscr_set + }, +#endif +#ifdef CONFIG_PPC64 + [REGSET_PPR] = { + .core_note_type = NT_PPC_PPR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .get = ppr_get, .set = ppr_set + }, + [REGSET_DSCR] = { + .core_note_type = NT_PPC_DSCR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .get = dscr_get, .set = dscr_set + }, +#endif +#ifdef CONFIG_PPC_BOOK3S_64 + [REGSET_TAR] = { + .core_note_type = NT_PPC_TAR, .n = 1, + .size = sizeof(u64), .align = sizeof(u64), + .get = tar_get, .set = tar_set + }, + [REGSET_EBB] = { + .core_note_type = NT_PPC_EBB, .n = ELF_NEBB, + .size = sizeof(u64), .align = sizeof(u64), + .active = ebb_active, .get = ebb_get, .set = ebb_set + }, +#endif }; static const struct user_regset_view user_ppc_compat_view = { diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 714b4ba7ab86..dba265c586df 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -66,6 +66,7 @@ #include <asm/hugetlb.h> #include <asm/livepatch.h> #include <asm/mmu_context.h> +#include <asm/cpu_has_feature.h> #include "setup.h" diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 00f57754407e..c3e861df4b20 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -37,6 +37,7 @@ #include 
<asm/serial.h> #include <asm/udbg.h> #include <asm/code-patching.h> +#include <asm/cpu_has_feature.h> #define DBG(fmt...) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index d8216aed22b7..eafb9a79e011 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -227,8 +227,8 @@ static void __init configure_exceptions(void) opal_configure_cores(); /* Enable AIL if supported, and we are in hypervisor mode */ - if (cpu_has_feature(CPU_FTR_HVMODE) && - cpu_has_feature(CPU_FTR_ARCH_207S)) { + if (early_cpu_has_feature(CPU_FTR_HVMODE) && + early_cpu_has_feature(CPU_FTR_ARCH_207S)) { unsigned long lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3); } @@ -298,12 +298,12 @@ void __init early_setup(unsigned long dt_ptr) */ configure_exceptions(); - /* Initialize the hash table or TLB handling */ - early_init_mmu(); - /* Apply all the dynamic patching */ apply_feature_fixups(); + /* Initialize the hash table or TLB handling */ + early_init_mmu(); + /* * At this point, we can let interrupts switch to virtual mode * (the MMU has been setup), so adjust the MSR in the PACA to diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5a1f015ea9f3..25a39052bf6b 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -55,6 +55,7 @@ #include <asm/debug.h> #include <asm/kexec.h> #include <asm/asm-prototypes.h> +#include <asm/cpu_has_feature.h> #ifdef DEBUG #include <asm/udbg.h> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 4e7759c8ca30..3efbedefba6a 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -56,6 +56,7 @@ #include <linux/irq_work.h> #include <linux/clk-provider.h> #include <linux/suspend.h> +#include <linux/rtc.h> #include <asm/trace.h> #include <asm/io.h> @@ -1159,6 +1160,29 @@ void calibrate_delay(void) loops_per_jiffy = tb_ticks_per_jiffy; } +#if IS_ENABLED(CONFIG_RTC_DRV_GENERIC) +static int rtc_generic_get_time(struct 
device *dev, struct rtc_time *tm) +{ + ppc_md.get_rtc_time(tm); + return rtc_valid_tm(tm); +} + +static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm) +{ + if (!ppc_md.set_rtc_time) + return -EOPNOTSUPP; + + if (ppc_md.set_rtc_time(tm) < 0) + return -EOPNOTSUPP; + + return 0; +} + +static const struct rtc_class_ops rtc_generic_ops = { + .read_time = rtc_generic_get_time, + .set_time = rtc_generic_set_time, +}; + static int __init rtc_init(void) { struct platform_device *pdev; @@ -1166,9 +1190,12 @@ static int __init rtc_init(void) if (!ppc_md.get_rtc_time) return -ENODEV; - pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0); + pdev = platform_device_register_data(NULL, "rtc-generic", -1, + &rtc_generic_ops, + sizeof(rtc_generic_ops)); return PTR_ERR_OR_ZERO(pdev); } device_initcall(rtc_init); +#endif diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index defb2998b818..74145f02ad41 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -13,6 +13,7 @@ */ #include <linux/types.h> +#include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> @@ -152,9 +153,18 @@ static void do_final_fixups(void) #endif } -void apply_feature_fixups(void) +static unsigned long __initdata saved_cpu_features; +static unsigned int __initdata saved_mmu_features; +#ifdef CONFIG_PPC64 +static unsigned long __initdata saved_firmware_features; +#endif + +void __init apply_feature_fixups(void) { - struct cpu_spec *spec = *PTRRELOC(&cur_cpu_spec); + struct cpu_spec *spec = PTRRELOC(*PTRRELOC(&cur_cpu_spec)); + + *PTRRELOC(&saved_cpu_features) = spec->cpu_features; + *PTRRELOC(&saved_mmu_features) = spec->mmu_features; /* * Apply the CPU-specific and firmware specific fixups to kernel text @@ -173,11 +183,36 @@ void apply_feature_fixups(void) PTRRELOC(&__stop___lwsync_fixup)); #ifdef CONFIG_PPC64 + saved_firmware_features = powerpc_firmware_features; 
do_feature_fixups(powerpc_firmware_features, &__start___fw_ftr_fixup, &__stop___fw_ftr_fixup); #endif do_final_fixups(); + + /* + * Initialise jump label. This causes all the cpu/mmu_has_feature() + * checks to take on their correct polarity based on the current set of + * CPU/MMU features. + */ + jump_label_init(); + cpu_feature_keys_init(); + mmu_feature_keys_init(); +} + +static int __init check_features(void) +{ + WARN(saved_cpu_features != cur_cpu_spec->cpu_features, + "CPU features changed after feature patching!\n"); + WARN(saved_mmu_features != cur_cpu_spec->mmu_features, + "MMU features changed after feature patching!\n"); +#ifdef CONFIG_PPC64 + WARN(saved_firmware_features != powerpc_firmware_features, + "Firmware features changed after feature patching!\n"); +#endif + + return 0; } +late_initcall(check_features); #ifdef CONFIG_FTR_FIXUP_SELFTEST diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 88ce7d212320..0e4e9654bd2c 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -72,8 +72,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) /* clear out bits after (52) [0....52.....63] */ va &= ~((1ul << (64 - 52)) - 1); va |= ssize << 8; - sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) | - ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4); + sllp = get_sllp_encoding(apsize); va |= sllp << 5; asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) @@ -122,8 +121,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) /* clear out bits after(52) [0....52.....63] */ va &= ~((1ul << (64 - 52)) - 1); va |= ssize << 8; - sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) | - ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4); + sllp = get_sllp_encoding(apsize); va |= sllp << 5; asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)" : : "r"(va) : "memory"); @@ 
-749,5 +747,5 @@ void __init hpte_init_native(void) mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate; if (cpu_has_feature(CPU_FTR_ARCH_300)) - ppc_md.register_process_table = native_register_proc_table; + register_process_table = native_register_proc_table; } diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index b78b5d211278..0821556e16f4 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -363,11 +363,6 @@ static int __init htab_dt_scan_seg_sizes(unsigned long node, return 0; } -static void __init htab_init_seg_sizes(void) -{ - of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); -} - static int __init get_idx_from_shift(unsigned int shift) { int idx = -1; @@ -539,7 +534,7 @@ static bool might_have_hea(void) #endif /* #ifdef CONFIG_PPC_64K_PAGES */ -static void __init htab_init_page_sizes(void) +static void __init htab_scan_page_sizes(void) { int rc; @@ -554,17 +549,23 @@ static void __init htab_init_page_sizes(void) * Try to find the available page sizes in the device-tree */ rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); - if (rc != 0) /* Found */ - goto found; - - /* - * Not in the device-tree, let's fallback on known size - * list for 16M capable GP & GR - */ - if (mmu_has_feature(MMU_FTR_16M_PAGE)) + if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) { + /* + * Nothing in the device-tree, but the CPU supports 16M pages, + * so let's fallback on a known size list for 16M capable CPUs. + */ memcpy(mmu_psize_defs, mmu_psize_defaults_gp, sizeof(mmu_psize_defaults_gp)); -found: + } + +#ifdef CONFIG_HUGETLB_PAGE + /* Reserve 16G huge page memory sections for huge pages */ + of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); +#endif /* CONFIG_HUGETLB_PAGE */ +} + +static void __init htab_init_page_sizes(void) +{ if (!debug_pagealloc_enabled()) { /* * Pick a size for the linear mapping. 
Currently, we only @@ -630,11 +631,6 @@ found: ,mmu_psize_defs[mmu_vmemmap_psize].shift #endif ); - -#ifdef CONFIG_HUGETLB_PAGE - /* Reserve 16G huge page memory sections for huge pages */ - of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); -#endif /* CONFIG_HUGETLB_PAGE */ } static int __init htab_dt_scan_pftsize(unsigned long node, @@ -759,12 +755,6 @@ static void __init htab_initialize(void) DBG(" -> htab_initialize()\n"); - /* Initialize segment sizes */ - htab_init_seg_sizes(); - - /* Initialize page sizes */ - htab_init_page_sizes(); - if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { mmu_kernel_ssize = MMU_SEGSIZE_1T; mmu_highuser_ssize = MMU_SEGSIZE_1T; @@ -885,8 +875,19 @@ static void __init htab_initialize(void) #undef KB #undef MB +void __init hash__early_init_devtree(void) +{ + /* Initialize segment sizes */ + of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); + + /* Initialize page sizes */ + htab_scan_page_sizes(); +} + void __init hash__early_init_mmu(void) { + htab_init_page_sizes(); + /* * initialize page table size */ diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c index 1e11559e1aac..35254a678456 100644 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -5,39 +5,34 @@ #include <asm/cacheflush.h> #include <asm/machdep.h> #include <asm/mman.h> +#include <asm/tlb.h> void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { - unsigned long ap, shift; + int psize; struct hstate *hstate = hstate_file(vma->vm_file); - shift = huge_page_shift(hstate); - if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) - ap = mmu_get_ap(MMU_PAGE_2M); - else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) - ap = mmu_get_ap(MMU_PAGE_1G); - else { - WARN(1, "Wrong huge page shift\n"); - return ; - } - radix___flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); + psize = hstate_get_psize(hstate); + radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); } void 
radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { - unsigned long ap, shift; + int psize; struct hstate *hstate = hstate_file(vma->vm_file); - shift = huge_page_shift(hstate); - if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) - ap = mmu_get_ap(MMU_PAGE_2M); - else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) - ap = mmu_get_ap(MMU_PAGE_1G); - else { - WARN(1, "Wrong huge page shift\n"); - return ; - } - radix___local_flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); + psize = hstate_get_psize(hstate); + radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); +} + +void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + int psize; + struct hstate *hstate = hstate_file(vma->vm_file); + + psize = hstate_get_psize(hstate); + radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); } /* diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 33709bdb0419..16ada1eb7e26 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -411,3 +411,25 @@ struct page *realmode_pfn_to_page(unsigned long pfn) EXPORT_SYMBOL_GPL(realmode_pfn_to_page); #endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ + +#ifdef CONFIG_PPC_STD_MMU_64 +static bool disable_radix; +static int __init parse_disable_radix(char *p) +{ + disable_radix = true; + return 0; +} +early_param("disable_radix", parse_disable_radix); + +void __init mmu_early_init_devtree(void) +{ + /* Disable radix mode based on kernel command line. 
*/ + if (disable_radix) + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + + if (early_radix_enabled()) + radix__early_init_devtree(); + else + hash__early_init_devtree(); +} +#endif /* CONFIG_PPC_STD_MMU_64 */ diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 670318766545..34079302cc17 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -14,6 +14,9 @@ #include "mmu_decl.h" #include <trace/events/thp.h> +int (*register_process_table)(unsigned long base, unsigned long page_size, + unsigned long tbl_size); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is called when relaxing access to a hugepage. It's also called in the page @@ -33,7 +36,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, changed = !pmd_same(*(pmdp), entry); if (changed) { __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } return changed; } @@ -66,7 +69,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* * This ensures that generic code that rely on IRQ disabling * to prevent a parallel THP split work as expected. diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 003ff48a11b6..af897d91d09f 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -171,7 +171,7 @@ redo: * of process table here. But our linear mapping also enable us to use * physical address here. 
*/ - ppc_md.register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); + register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); } @@ -198,7 +198,7 @@ static void __init radix_init_partition_table(void) void __init radix_init_native(void) { - ppc_md.register_process_table = native_register_process_table; + register_process_table = native_register_process_table; } static int __init get_idx_from_shift(unsigned int shift) @@ -264,7 +264,7 @@ static int __init radix_dt_scan_page_sizes(unsigned long node, return 1; } -static void __init radix_init_page_sizes(void) +void __init radix__early_init_devtree(void) { int rc; @@ -343,7 +343,6 @@ void __init radix__early_init_mmu(void) __pte_frag_nr = H_PTE_FRAG_NR; __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; - radix_init_page_sizes(); if (!firmware_has_feature(FW_FEATURE_LPAR)) { radix_init_native(); lpcr = mfspr(SPRN_LPCR); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 88a307504b5a..0b6fb244d0a1 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -225,7 +225,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, if (!is_vm_hugetlb_page(vma)) assert_pte_locked(vma->vm_mm, address); __ptep_set_access_flags(ptep, entry); - flush_tlb_page_nohash(vma, address); + flush_tlb_page(vma, address); } return changed; } diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index e1f22700fb16..48df05ef5231 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -140,10 +140,11 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) } EXPORT_SYMBOL(radix__local_flush_tlb_pwc); -void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, - unsigned long ap, int nid) +void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) { unsigned long pid; + 
unsigned long ap = mmu_get_ap(psize); preempt_disable(); pid = mm ? mm->context.id : 0; @@ -159,18 +160,12 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd if (vma && is_vm_hugetlb_page(vma)) return __local_flush_hugetlb_page(vma, vmaddr); #endif - radix___local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - mmu_get_ap(mmu_virtual_psize), 0); + radix__local_flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr, + mmu_virtual_psize); } EXPORT_SYMBOL(radix__local_flush_tlb_page); #ifdef CONFIG_SMP -static int mm_is_core_local(struct mm_struct *mm) -{ - return cpumask_subset(mm_cpumask(mm), - topology_sibling_cpumask(smp_processor_id())); -} - void radix__flush_tlb_mm(struct mm_struct *mm) { unsigned long pid; @@ -221,10 +216,11 @@ no_context: } EXPORT_SYMBOL(radix__flush_tlb_pwc); -void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, - unsigned long ap, int nid) +void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) { unsigned long pid; + unsigned long ap = mmu_get_ap(psize); preempt_disable(); pid = mm ? mm->context.id : 0; @@ -250,8 +246,8 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) if (vma && is_vm_hugetlb_page(vma)) return flush_hugetlb_page(vma, vmaddr); #endif - radix___flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - mmu_get_ap(mmu_virtual_psize), 0); + radix__flush_tlb_page_psize(vma ? 
vma->vm_mm : NULL, vmaddr, + mmu_virtual_psize); } EXPORT_SYMBOL(radix__flush_tlb_page); @@ -299,8 +295,65 @@ static int radix_get_mmu_psize(int page_size) void radix__tlb_flush(struct mmu_gather *tlb) { + int psize = 0; struct mm_struct *mm = tlb->mm; - radix__flush_tlb_mm(mm); + int page_size = tlb->page_size; + + psize = radix_get_mmu_psize(page_size); + /* + * if page size is not something we understand, do a full mm flush + */ + if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all) + radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize); + else + radix__flush_tlb_mm(mm); +} + +#define TLB_FLUSH_ALL -1UL +/* + * Number of pages above which we will do a bcast tlbie. Just a + * number at this point copied from x86 + */ +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; + +void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) +{ + unsigned long pid; + unsigned long addr; + int local = mm_is_core_local(mm); + unsigned long ap = mmu_get_ap(psize); + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + unsigned long page_size = 1UL << mmu_psize_defs[psize].shift; + + + preempt_disable(); + pid = mm ? 
mm->context.id : 0; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto err_out; + + if (end == TLB_FLUSH_ALL || + (end - start) > tlb_single_page_flush_ceiling * page_size) { + if (local) + _tlbiel_pid(pid, RIC_FLUSH_TLB); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + goto err_out; + } + for (addr = start; addr < end; addr += page_size) { + + if (local) + _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + else { + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } + } +err_out: + preempt_enable(); } void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa, @@ -340,3 +393,10 @@ void radix__flush_tlb_lpid(unsigned long lpid) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } EXPORT_SYMBOL(radix__flush_tlb_lpid); + +void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M); +} +EXPORT_SYMBOL(radix__flush_pmd_tlb_range); diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index 558e30cce33e..702d7689d714 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -49,17 +49,6 @@ void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) EXPORT_SYMBOL(flush_hash_entry); /* - * Called by ptep_set_access_flags, must flush on CPUs for which the - * DSI handler can't just "fixup" the TLB on a write fault - */ -void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr) -{ - if (Hash != 0) - return; - _tlbie(addr); -} - -/* * Called at the end of a mmu_gather operation to make sure the * TLB flush is completely done. 
*/ diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index f4668488512c..050badc0ebd3 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -215,12 +215,6 @@ EXPORT_SYMBOL(local_flush_tlb_page); static DEFINE_RAW_SPINLOCK(tlbivax_lock); -static int mm_is_core_local(struct mm_struct *mm) -{ - return cpumask_subset(mm_cpumask(mm), - topology_sibling_cpumask(smp_processor_id())); -} - struct tlb_flush_param { unsigned long addr; unsigned int pid; diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h index cda6fcb809ca..6447dc1c3d89 100644 --- a/arch/powerpc/perf/power9-events-list.h +++ b/arch/powerpc/perf/power9-events-list.h @@ -34,15 +34,15 @@ EVENT(PM_L1_ICACHE_MISS, 0x200fd) /* Instruction Demand sectors wriittent into IL1 */ EVENT(PM_L1_DEMAND_WRITE, 0x0408c) /* Instruction prefetch written into IL1 */ -EVENT(PM_IC_PREF_WRITE, 0x0408e) +EVENT(PM_IC_PREF_WRITE, 0x0488c) /* The data cache was reloaded from local core's L3 due to a demand load */ EVENT(PM_DATA_FROM_L3, 0x4c042) /* Demand LD - L3 Miss (not L2 hit and not L3 hit) */ EVENT(PM_DATA_FROM_L3MISS, 0x300fe) /* All successful D-side store dispatches for this thread */ -EVENT(PM_L2_ST, 0x16081) +EVENT(PM_L2_ST, 0x16880) /* All successful D-side store dispatches for this thread that were L2 Miss */ -EVENT(PM_L2_ST_MISS, 0x26081) +EVENT(PM_L2_ST_MISS, 0x26880) /* Total HW L3 prefetches(Load+store) */ EVENT(PM_L3_PREF_ALL, 0x4e052) /* Data PTEG reload */ diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index 3663f71fd913..fbdae8377b71 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -321,6 +321,17 @@ config OF_RTC Uses information from the OF or flattened device tree to instantiate platform devices for direct mapped RTC chips like the DS1742 or DS1743. 
+config GEN_RTC + bool "Use the platform RTC operations from user space" + select RTC_CLASS + select RTC_DRV_GENERIC + help + This option provides backwards compatibility with the old gen_rtc.ko + module that was traditionally used for old PowerPC machines. + Platforms should migrate to enabling the RTC_DRV_GENERIC by hand + replacing their get_rtc_time/set_rtc_time callbacks with + a proper RTC device driver. + config SIMPLE_GPIO bool "Support for simple, memory-mapped GPIO controllers" depends on PPC diff --git a/arch/powerpc/platforms/cell/pervasive.c b/arch/powerpc/platforms/cell/pervasive.c index d17e98bc0c10..e7d075077cb0 100644 --- a/arch/powerpc/platforms/cell/pervasive.c +++ b/arch/powerpc/platforms/cell/pervasive.c @@ -35,6 +35,7 @@ #include <asm/pgtable.h> #include <asm/reg.h> #include <asm/cell-regs.h> +#include <asm/cpu_has_feature.h> #include "pervasive.h" diff --git a/arch/powerpc/platforms/ps3/time.c b/arch/powerpc/platforms/ps3/time.c index 791c6142c4a7..11b45b58c81b 100644 --- a/arch/powerpc/platforms/ps3/time.c +++ b/arch/powerpc/platforms/ps3/time.c @@ -20,9 +20,9 @@ #include <linux/kernel.h> #include <linux/platform_device.h> +#include <linux/rtc.h> #include <asm/firmware.h> -#include <asm/rtc.h> #include <asm/lv1call.h> #include <asm/ps3.h> diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 984e816f3faf..68e7c0dd2e45 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -491,6 +491,7 @@ int fsl_rio_setup(struct platform_device *dev) rmu_node = of_parse_phandle(dev->dev.of_node, "fsl,srio-rmu-handle", 0); if (!rmu_node) { dev_err(&dev->dev, "No valid fsl,srio-rmu-handle property\n"); + rc = -ENOENT; goto err_rmu; } rc = of_address_to_resource(rmu_node, 0, &rmu_regs); diff --git a/arch/powerpc/xmon/ppc-dis.c b/arch/powerpc/xmon/ppc-dis.c index 89098f320ad5..ee9891734149 100644 --- a/arch/powerpc/xmon/ppc-dis.c +++ b/arch/powerpc/xmon/ppc-dis.c @@ -20,6 +20,7 @@ along with this file; see 
the file COPYING. If not, write to the Free Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. */ #include <asm/cputable.h> +#include <asm/cpu_has_feature.h> #include "nonstdio.h" #include "ansidecl.h" #include "ppc.h" diff --git a/arch/sh/include/asm/mc146818rtc.h b/arch/sh/include/asm/mc146818rtc.h deleted file mode 100644 index 0aee96a97330..000000000000 --- a/arch/sh/include/asm/mc146818rtc.h +++ /dev/null @@ -1,7 +0,0 @@ -/* - * Machine dependent access functions for RTC registers. - */ -#ifndef _ASM_MC146818RTC_H -#define _ASM_MC146818RTC_H - -#endif /* _ASM_MC146818RTC_H */ diff --git a/arch/sh/include/asm/rtc.h b/arch/sh/include/asm/rtc.h index 52b0c2dba979..f7b010d48af7 100644 --- a/arch/sh/include/asm/rtc.h +++ b/arch/sh/include/asm/rtc.h @@ -6,17 +6,6 @@ extern void (*board_time_init)(void); extern void (*rtc_sh_get_time)(struct timespec *); extern int (*rtc_sh_set_time)(const time_t); -/* some dummy definitions */ -#define RTC_BATT_BAD 0x100 /* battery bad */ -#define RTC_SQWE 0x08 /* enable square-wave output */ -#define RTC_DM_BINARY 0x04 /* all time/date values are BCD if clear */ -#define RTC_24H 0x02 /* 24 hour mode - else hours bit 7 means pm */ -#define RTC_DST_EN 0x01 /* auto switch DST - works f. 
USA only */ - -struct rtc_time; -unsigned int get_rtc_time(struct rtc_time *); -int set_rtc_time(struct rtc_time *); - #define RTC_CAP_4_DIGIT_YEAR (1 << 0) struct sh_rtc_platform_info { diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index d6d0a986c6e9..a4a7862b489a 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -50,27 +50,31 @@ int update_persistent_clock(struct timespec now) } #endif -unsigned int get_rtc_time(struct rtc_time *tm) +static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm) { - if (rtc_sh_get_time != null_rtc_get_time) { - struct timespec tv; + struct timespec tv; - rtc_sh_get_time(&tv); - rtc_time_to_tm(tv.tv_sec, tm); - } - - return RTC_24H; + rtc_sh_get_time(&tv); + rtc_time_to_tm(tv.tv_sec, tm); + return 0; } -EXPORT_SYMBOL(get_rtc_time); -int set_rtc_time(struct rtc_time *tm) +static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm) { unsigned long secs; rtc_tm_to_time(tm, &secs); - return rtc_sh_set_time(secs); + if ((rtc_sh_set_time == null_rtc_set_time) || + (rtc_sh_set_time(secs) < 0)) + return -EOPNOTSUPP; + + return 0; } -EXPORT_SYMBOL(set_rtc_time); + +static const struct rtc_class_ops rtc_generic_ops = { + .read_time = rtc_generic_get_time, + .set_time = rtc_generic_set_time, +}; static int __init rtc_generic_init(void) { @@ -79,7 +83,10 @@ static int __init rtc_generic_init(void) if (rtc_sh_get_time == null_rtc_get_time) return -ENODEV; - pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0); + pdev = platform_device_register_data(NULL, "rtc-generic", -1, + &rtc_generic_ops, + sizeof(rtc_generic_ops)); + return PTR_ERR_OR_ZERO(pdev); } diff --git a/arch/sparc/include/asm/io_32.h b/arch/sparc/include/asm/io_32.h index 57f26c398dc9..4dd268a3a8b0 100644 --- a/arch/sparc/include/asm/io_32.h +++ b/arch/sparc/include/asm/io_32.h @@ -140,16 +140,6 @@ void ioport_unmap(void __iomem *); struct pci_dev; void pci_iounmap(struct pci_dev *dev, void __iomem *); - - -/* - * 
At the moment, we do not use CMOS_READ anywhere outside of rtc.c, - * so rtc_port is static in it. This should not change unless a new - * hardware pops up. - */ -#define RTC_PORT(x) (rtc_port + (x)) -#define RTC_ALWAYS_BCD 0 - static inline int sbus_can_dma_64bit(void) { return 0; /* actually, sparc_cpu_model==sun4d */ diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index 58650d098fb4..fd443852103c 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -1,10 +1,12 @@ config UML bool default y + select ARCH_HAS_KCOV select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_SECCOMP_FILTER select HAVE_UID16 select HAVE_FUTEX_CMPXCHG if FUTEX + select HAVE_DEBUG_KMEMLEAK select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select GENERIC_IO @@ -31,10 +33,9 @@ config PCI config PCMCIA bool -# Yet to do! config TRACE_IRQFLAGS_SUPPORT bool - default n + default y config LOCKDEP_SUPPORT bool diff --git a/arch/um/include/asm/irqflags.h b/arch/um/include/asm/irqflags.h index c780d8a16773..3bb221e1d5a4 100644 --- a/arch/um/include/asm/irqflags.h +++ b/arch/um/include/asm/irqflags.h @@ -6,37 +6,33 @@ extern int set_signals(int enable); extern void block_signals(void); extern void unblock_signals(void); +#define arch_local_save_flags arch_local_save_flags static inline unsigned long arch_local_save_flags(void) { return get_signals(); } +#define arch_local_irq_restore arch_local_irq_restore static inline void arch_local_irq_restore(unsigned long flags) { set_signals(flags); } +#define arch_local_irq_enable arch_local_irq_enable static inline void arch_local_irq_enable(void) { unblock_signals(); } +#define arch_local_irq_disable arch_local_irq_disable static inline void arch_local_irq_disable(void) { block_signals(); } -static inline unsigned long arch_local_irq_save(void) -{ - unsigned long flags; - flags = arch_local_save_flags(); - arch_local_irq_disable(); - return flags; -} +#define ARCH_IRQ_DISABLED 0 +#define ARCh_IRQ_ENABLED (SIGIO|SIGVTALRM) -static inline 
bool arch_irqs_disabled(void) -{ - return arch_local_save_flags() == 0; -} +#include <asm-generic/irqflags.h> #endif diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index a6a5e42caaef..2f36d515762e 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -3,6 +3,11 @@ # Licensed under the GPL # +# Don't instrument UML-specific code; without this, we may crash when +# accessing the instrumentation buffer for the first time from the +# kernel. +KCOV_INSTRUMENT := n + CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \ -DELF_ARCH=$(LDS_ELF_ARCH) \ -DELF_FORMAT=$(LDS_ELF_FORMAT) \ diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c index 55cead809b18..48bae81f8dca 100644 --- a/arch/um/kernel/initrd.c +++ b/arch/um/kernel/initrd.c @@ -37,8 +37,6 @@ static int __init read_initrd(void) } area = alloc_bootmem(size); - if (area == NULL) - return 0; if (load_initrd(initrd, area, size) == -1) return 0; diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 16630e75f056..e8175a8aa22c 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -319,9 +319,6 @@ int __init linux_main(int argc, char **argv) start_vm = VMALLOC_START; - setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem); - mem_total_pages(physmem_size, iomem_size, highmem); - virtmem_size = physmem_size; stack = (unsigned long) argv; stack &= ~(1024 * 1024 - 1); @@ -334,7 +331,6 @@ int __init linux_main(int argc, char **argv) printf("Kernel virtual memory size shrunk to %lu bytes\n", virtmem_size); - stack_protections((unsigned long) &init_thread_info); os_flush_stdout(); return start_uml(); @@ -342,6 +338,10 @@ int __init linux_main(int argc, char **argv) void __init setup_arch(char **cmdline_p) { + stack_protections((unsigned long) &init_thread_info); + setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem); + mem_total_pages(physmem_size, iomem_size, highmem); + paging_init(); strlcpy(boot_command_line, command_line, 
COMMAND_LINE_SIZE); *cmdline_p = command_line; diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 08ff5094fcdd..ada473bf6f46 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -3,6 +3,9 @@ # Licensed under the GPL # +# Don't instrument UML-specific code +KCOV_INSTRUMENT := n + obj-y = aio.o execvp.o file.o helper.o irq.o main.o mem.o process.o \ registers.o sigio.o signal.o start_up.o time.o tty.o \ umid.o user_syms.o util.o drivers/ skas/ diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 8acaf4e384c0..a86d7cc2c2d8 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -15,6 +15,7 @@ #include <kern_util.h> #include <os.h> #include <sysdep/mcontext.h> +#include <um_malloc.h> void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = { [SIGTRAP] = relay_signal, @@ -32,7 +33,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) struct uml_pt_regs *r; int save_errno = errno; - r = malloc(sizeof(struct uml_pt_regs)); + r = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC); if (!r) panic("out of memory"); @@ -91,7 +92,7 @@ static void timer_real_alarm_handler(mcontext_t *mc) { struct uml_pt_regs *regs; - regs = malloc(sizeof(struct uml_pt_regs)); + regs = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC); if (!regs) panic("out of memory"); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3a9add58d794..5c6e7471b732 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -152,6 +152,7 @@ config X86 select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select PERF_EVENTS select RTC_LIB + select RTC_MC146818_LIB select SPARSE_IRQ select SRCU select SYSCTL_EXCEPTION_TRACE diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 0f555cc31984..24acd9ba7837 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -6,7 +6,6 @@ #include <asm/io.h> #include <asm/processor.h> 
-#include <linux/mc146818rtc.h> #ifndef RTC_PORT #define RTC_PORT(x) (0x70 + (x)) diff --git a/arch/x86/include/asm/rtc.h b/arch/x86/include/asm/rtc.h deleted file mode 100644 index f71c3b0ed360..000000000000 --- a/arch/x86/include/asm/rtc.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/rtc.h> diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3d747070fe67..ed16e58658a4 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1019,7 +1019,6 @@ void hpet_disable(void) */ #include <linux/mc146818rtc.h> #include <linux/rtc.h> -#include <asm/rtc.h> #define DEFAULT_RTC_INT_FREQ 64 #define DEFAULT_RTC_SHIFT 6 @@ -1243,7 +1242,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - get_rtc_time(&curr_time); + mc146818_get_time(&curr_time); if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 04b132a767f1..bfe4d6c96fbd 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -17,6 +17,7 @@ #include <linux/debugfs.h> #include <linux/delay.h> #include <linux/hardirq.h> +#include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/export.h> diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index eceaa082ec3f..79c6311cd912 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -13,7 +13,6 @@ #include <asm/x86_init.h> #include <asm/time.h> #include <asm/intel-mid.h> -#include <asm/rtc.h> #include <asm/setup.h> #ifdef CONFIG_X86_32 @@ -47,7 +46,7 @@ int mach_set_rtc_mmss(const struct timespec *now) rtc_time_to_tm(nowtime, &tm); if (!rtc_valid_tm(&tm)) { - retval = set_rtc_time(&tm); + retval = mc146818_set_time(&tm); if (retval) printk(KERN_ERR "%s: RTC write failed with error %d\n", __func__, retval); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 17c8bbd4e2f0..1fbb408e2e72 100644 ---
a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -51,7 +51,6 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/x86_init.h> -#include <asm/rtc.h> #include <asm/uv/uv.h> static struct efi efi_phys __initdata; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 04db6fbce96d..677e29e29473 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -25,6 +25,7 @@ #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/init.h> +#include <linux/mc146818rtc.h> #include <linux/efi.h> #include <linux/uaccess.h> #include <linux/io.h> diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c index ee40fcb6e54d..58024862a7eb 100644 --- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c +++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c @@ -22,6 +22,7 @@ #include <linux/init.h> #include <linux/sfi.h> #include <linux/platform_device.h> +#include <linux/mc146818rtc.h> #include <asm/intel-mid.h> #include <asm/intel_mid_vrtc.h> diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index f2b5e6a5cf95..f0b5f2d402af 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -37,11 +37,11 @@ unsigned long jump_address_phys; */ unsigned long restore_cr3 __visible; -pgd_t *temp_level4_pgt __visible; +unsigned long temp_level4_pgt __visible; unsigned long relocated_restore_code __visible; -static int set_up_temporary_text_mapping(void) +static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; pud_t *pud; @@ -71,7 +71,7 @@ static int set_up_temporary_text_mapping(void) __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); set_pud(pud + pud_index(restore_jump_address), __pud(__pa(pmd) | _KERNPG_TABLE)); - set_pgd(temp_level4_pgt + pgd_index(restore_jump_address), + set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE)); return 0; @@ -90,15 +90,16 @@ 
static int set_up_temporary_mappings(void) .kernel_mapping = true, }; unsigned long mstart, mend; + pgd_t *pgd; int result; int i; - temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!temp_level4_pgt) + pgd = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!pgd) return -ENOMEM; /* Prepare a temporary mapping for the kernel text */ - result = set_up_temporary_text_mapping(); + result = set_up_temporary_text_mapping(pgd); if (result) return result; @@ -107,13 +108,12 @@ static int set_up_temporary_mappings(void) mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; - result = kernel_ident_mapping_init(&info, temp_level4_pgt, - mstart, mend); - + result = kernel_ident_mapping_init(&info, pgd, mstart, mend); if (result) return result; } + temp_level4_pgt = (unsigned long)pgd - __PAGE_OFFSET; return 0; } diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 8eee0e9c93f0..ce8da3a0412c 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -72,8 +72,6 @@ ENTRY(restore_image) /* code below has been relocated to a safe page */ ENTRY(core_restore_code) /* switch to temporary page tables */ - movq $__PAGE_OFFSET, %rcx - subq %rcx, %rax movq %rax, %cr3 /* flush TLB */ movq %rbx, %rcx diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 6c803ca49b5d..d72dec406ccb 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -2,6 +2,9 @@ # Building vDSO images for x86. # +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. 
+KCOV_INSTRUMENT := n + VDSO64-y := y vdso-install-$(VDSO64-y) += vdso.so diff --git a/drivers/acpi/acpi_cmos_rtc.c b/drivers/acpi/acpi_cmos_rtc.c index 81dc75033f15..0980a133916f 100644 --- a/drivers/acpi/acpi_cmos_rtc.c +++ b/drivers/acpi/acpi_cmos_rtc.c @@ -14,7 +14,7 @@ #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> -#include <asm-generic/rtc.h> +#include <linux/mc146818rtc.h> #include "internal.h" diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 148f4e5ca104..31abb0bdd4f2 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -232,8 +232,10 @@ remove_dev_dir: acpi_device_dir(device) = NULL; remove_lid_dir: remove_proc_entry(ACPI_BUTTON_SUBCLASS_LID, acpi_button_dir); + acpi_lid_dir = NULL; remove_button_dir: remove_proc_entry(ACPI_BUTTON_CLASS, acpi_root_dir); + acpi_button_dir = NULL; goto done; } @@ -250,7 +252,9 @@ static int acpi_button_remove_fs(struct acpi_device *device) acpi_lid_dir); acpi_device_dir(device) = NULL; remove_proc_entry(ACPI_BUTTON_SUBCLASS_LID, acpi_button_dir); + acpi_lid_dir = NULL; remove_proc_entry(ACPI_BUTTON_CLASS, acpi_root_dir); + acpi_button_dir = NULL; return 0; } diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 999a10914678..e7bd57cc550a 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -101,6 +101,7 @@ enum ec_command { #define ACPI_EC_UDELAY_POLL 550 /* Wait 1ms for EC transaction polling */ #define ACPI_EC_CLEAR_MAX 100 /* Maximum number of events to query * when trying to clear the EC */ +#define ACPI_EC_MAX_QUERIES 16 /* Maximum number of parallel queries */ enum { EC_FLAGS_QUERY_PENDING, /* Query is pending */ @@ -121,6 +122,10 @@ static unsigned int ec_delay __read_mostly = ACPI_EC_DELAY; module_param(ec_delay, uint, 0644); MODULE_PARM_DESC(ec_delay, "Timeout(ms) waited until an EC command completes"); +static unsigned int ec_max_queries __read_mostly = ACPI_EC_MAX_QUERIES; +module_param(ec_max_queries, uint, 0644); +MODULE_PARM_DESC(ec_max_queries, 
"Maximum parallel _Qxx evaluations"); + static bool ec_busy_polling __read_mostly; module_param(ec_busy_polling, bool, 0644); MODULE_PARM_DESC(ec_busy_polling, "Use busy polling to advance EC transaction"); @@ -174,6 +179,7 @@ static void acpi_ec_event_processor(struct work_struct *work); struct acpi_ec *boot_ec, *first_ec; EXPORT_SYMBOL(first_ec); +static struct workqueue_struct *ec_query_wq; static int EC_FLAGS_CLEAR_ON_RESUME; /* Needs acpi_ec_clear() on boot/resume */ static int EC_FLAGS_QUERY_HANDSHAKE; /* Needs QR_EC issued when SCI_EVT set */ @@ -1098,7 +1104,7 @@ static int acpi_ec_query(struct acpi_ec *ec, u8 *data) * work queue execution. */ ec_dbg_evt("Query(0x%02x) scheduled", value); - if (!schedule_work(&q->work)) { + if (!queue_work(ec_query_wq, &q->work)) { ec_dbg_evt("Query(0x%02x) overlapped", value); result = -EBUSY; } @@ -1660,15 +1666,41 @@ static struct acpi_driver acpi_ec_driver = { }, }; +static inline int acpi_ec_query_init(void) +{ + if (!ec_query_wq) { + ec_query_wq = alloc_workqueue("kec_query", 0, + ec_max_queries); + if (!ec_query_wq) + return -ENODEV; + } + return 0; +} + +static inline void acpi_ec_query_exit(void) +{ + if (ec_query_wq) { + destroy_workqueue(ec_query_wq); + ec_query_wq = NULL; + } +} + int __init acpi_ec_init(void) { - int result = 0; + int result; + /* register workqueue for _Qxx evaluations */ + result = acpi_ec_query_init(); + if (result) + goto err_exit; /* Now register the driver for the EC */ result = acpi_bus_register_driver(&acpi_ec_driver); - if (result < 0) - return -ENODEV; + if (result) + goto err_exit; +err_exit: + if (result) + acpi_ec_query_exit(); return result; } @@ -1678,5 +1710,6 @@ static void __exit acpi_ec_exit(void) { acpi_bus_unregister_driver(&acpi_ec_driver); + acpi_ec_query_exit(); } #endif /* 0 */ diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index 7c04c87738a6..df0c70963d9e 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ 
-402,6 +402,22 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, } EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact); +static noinline struct dev_pm_opp *_find_freq_ceil(struct opp_table *opp_table, + unsigned long *freq) +{ + struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE); + + list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) { + if (temp_opp->available && temp_opp->rate >= *freq) { + opp = temp_opp; + *freq = opp->rate; + break; + } + } + + return opp; +} + /** * dev_pm_opp_find_freq_ceil() - Search for an rounded ceil freq * @dev: device for which we do this operation @@ -427,7 +443,6 @@ struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev, unsigned long *freq) { struct opp_table *opp_table; - struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE); opp_rcu_lockdep_assert(); @@ -440,15 +455,7 @@ struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev, if (IS_ERR(opp_table)) return ERR_CAST(opp_table); - list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) { - if (temp_opp->available && temp_opp->rate >= *freq) { - opp = temp_opp; - *freq = opp->rate; - break; - } - } - - return opp; + return _find_freq_ceil(opp_table, freq); } EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil); @@ -612,7 +619,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) return PTR_ERR(opp_table); } - old_opp = dev_pm_opp_find_freq_ceil(dev, &old_freq); + old_opp = _find_freq_ceil(opp_table, &old_freq); if (!IS_ERR(old_opp)) { ou_volt = old_opp->u_volt; ou_volt_min = old_opp->u_volt_min; @@ -622,7 +629,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) __func__, old_freq, PTR_ERR(old_opp)); } - opp = dev_pm_opp_find_freq_ceil(dev, &freq); + opp = _find_freq_ceil(opp_table, &freq); if (IS_ERR(opp)) { ret = PTR_ERR(opp); dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n", diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index a6975795e7f3..efec10b49d59 100644 --- 
a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -11,7 +11,7 @@ #include <linux/export.h> #include <linux/rtc.h> -#include <asm/rtc.h> +#include <linux/mc146818rtc.h> #include "power.h" @@ -103,7 +103,7 @@ static int set_magic_time(unsigned int user, unsigned int file, unsigned int dev n /= 24; time.tm_min = (n % 20) * 3; n /= 20; - set_rtc_time(&time); + mc146818_set_time(&time); return n ? -1 : 0; } @@ -112,7 +112,7 @@ static unsigned int read_magic_time(void) struct rtc_time time; unsigned int val; - get_rtc_time(&time); + mc146818_get_time(&time); pr_info("RTC time: %2d:%02d:%02d, date: %02d/%02d/%02d\n", time.tm_hour, time.tm_min, time.tm_sec, time.tm_mon + 1, time.tm_mday, time.tm_year % 100); diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 5fb7718f256c..62e4de2aa8d1 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -334,10 +334,9 @@ void device_wakeup_arm_wake_irqs(void) struct wakeup_source *ws; rcu_read_lock(); - list_for_each_entry_rcu(ws, &wakeup_sources, entry) { - if (ws->wakeirq) - dev_pm_arm_wake_irq(ws->wakeirq); - } + list_for_each_entry_rcu(ws, &wakeup_sources, entry) + dev_pm_arm_wake_irq(ws->wakeirq); + rcu_read_unlock(); } @@ -351,10 +350,9 @@ void device_wakeup_disarm_wake_irqs(void) struct wakeup_source *ws; rcu_read_lock(); - list_for_each_entry_rcu(ws, &wakeup_sources, entry) { - if (ws->wakeirq) - dev_pm_disarm_wake_irq(ws->wakeirq); - } + list_for_each_entry_rcu(ws, &wakeup_sources, entry) + dev_pm_disarm_wake_irq(ws->wakeirq); + rcu_read_unlock(); } @@ -390,9 +388,7 @@ int device_wakeup_disable(struct device *dev) return -EINVAL; ws = device_wakeup_detach(dev); - if (ws) - wakeup_source_unregister(ws); - + wakeup_source_unregister(ws); return 0; } EXPORT_SYMBOL_GPL(device_wakeup_disable); diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index fdb8f3e10b6f..dcc09739a54e 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -293,7 +293,7 @@ if 
RTC_LIB=n config RTC tristate "Enhanced Real Time Clock Support (legacy PC RTC driver)" - depends on ALPHA || (MIPS && MACH_LOONGSON64) || MN10300 + depends on ALPHA || (MIPS && MACH_LOONGSON64) ---help--- If you say Y here and create a character special file /dev/rtc with major number 10 and minor number 135 using mknod ("man mknod"), you @@ -339,32 +339,6 @@ config JS_RTC To compile this driver as a module, choose M here: the module will be called js-rtc. -config GEN_RTC - tristate "Generic /dev/rtc emulation" - depends on RTC!=y - depends on ALPHA || M68K || MN10300 || PARISC || PPC || X86 - ---help--- - If you say Y here and create a character special file /dev/rtc with - major number 10 and minor number 135 using mknod ("man mknod"), you - will get access to the real time clock (or hardware clock) built - into your computer. - - It reports status information via the file /proc/driver/rtc and its - behaviour is set by various ioctls on /dev/rtc. If you enable the - "extended RTC operation" below it will also provide an emulation - for RTC_UIE which is required by some programs and may improve - precision in some cases. - - To compile this driver as a module, choose M here: the - module will be called genrtc. - -config GEN_RTC_X - bool "Extended RTC operation" - depends on GEN_RTC - help - Provides an emulation for RTC_UIE which is required by some programs - and may improve precision of the generic RTC support in some cases. 
- config EFI_RTC bool "EFI Real Time Clock Services" depends on IA64 diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 55d16bf3ccc5..6e6c244a66a0 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -25,7 +25,6 @@ obj-$(CONFIG_APPLICOM) += applicom.o obj-$(CONFIG_SONYPI) += sonypi.o obj-$(CONFIG_RTC) += rtc.o obj-$(CONFIG_HPET) += hpet.o -obj-$(CONFIG_GEN_RTC) += genrtc.o obj-$(CONFIG_EFI_RTC) += efirtc.o obj-$(CONFIG_DS1302) += ds1302.o obj-$(CONFIG_XILINX_HWICAP) += xilinx_hwicap/ diff --git a/drivers/char/genrtc.c b/drivers/char/genrtc.c deleted file mode 100644 index 4f943759d376..000000000000 --- a/drivers/char/genrtc.c +++ /dev/null @@ -1,539 +0,0 @@ -/* - * Real Time Clock interface for - * - q40 and other m68k machines, - * - HP PARISC machines - * - PowerPC machines - * emulate some RTC irq capabilities in software - * - * Copyright (C) 1999 Richard Zidlicky - * - * based on Paul Gortmaker's rtc.c device and - * Sam Creasey Generic rtc driver - * - * This driver allows use of the real time clock (built into - * nearly all computers) from user space. It exports the /dev/rtc - * interface supporting various ioctl() and also the /proc/driver/rtc - * pseudo-file for status information. - * - * The ioctls can be used to set the interrupt behaviour where - * supported. - * - * The /dev/rtc interface will block on reads until an interrupt - * has been received. If a RTC interrupt has already happened, - * it will output an unsigned long and then block. The output value - * contains the interrupt status in the low byte and the number of - * interrupts since the last read in the remaining high bytes. The - * /dev/rtc interface can also be used with the select(2) call. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - - * 1.01 fix for 2.3.X rz@linux-m68k.org - * 1.02 merged with code from genrtc.c rz@linux-m68k.org - * 1.03 make it more portable zippel@linux-m68k.org - * 1.04 removed useless timer code rz@linux-m68k.org - * 1.05 portable RTC_UIE emulation rz@linux-m68k.org - * 1.06 set_rtc_time can return an error trini@kernel.crashing.org - * 1.07 ported to HP PARISC (hppa) Helge Deller <deller@gmx.de> - */ - -#define RTC_VERSION "1.07" - -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/errno.h> -#include <linux/miscdevice.h> -#include <linux/fcntl.h> - -#include <linux/rtc.h> -#include <linux/init.h> -#include <linux/poll.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/mutex.h> -#include <linux/workqueue.h> - -#include <asm/uaccess.h> -#include <asm/rtc.h> - -/* - * We sponge a minor off of the misc major. No need slurping - * up another valuable major dev number for this. If you add - * an ioctl, make sure you don't conflict with SPARC's RTC - * ioctls. - */ - -static DEFINE_MUTEX(gen_rtc_mutex); -static DECLARE_WAIT_QUEUE_HEAD(gen_rtc_wait); - -/* - * Bits in gen_rtc_status. - */ - -#define RTC_IS_OPEN 0x01 /* means /dev/rtc is in use */ - -static unsigned char gen_rtc_status; /* bitmapped status byte. 
*/ -static unsigned long gen_rtc_irq_data; /* our output to the world */ - -/* months start at 0 now */ -static unsigned char days_in_mo[] = -{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; - -static int irq_active; - -#ifdef CONFIG_GEN_RTC_X -static struct work_struct genrtc_task; -static struct timer_list timer_task; - -static unsigned int oldsecs; -static int lostint; -static unsigned long tt_exp; - -static void gen_rtc_timer(unsigned long data); - -static volatile int stask_active; /* schedule_work */ -static volatile int ttask_active; /* timer_task */ -static int stop_rtc_timers; /* don't requeue tasks */ -static DEFINE_SPINLOCK(gen_rtc_lock); - -static void gen_rtc_interrupt(unsigned long arg); - -/* - * Routine to poll RTC seconds field for change as often as possible, - * after first RTC_UIE use timer to reduce polling - */ -static void genrtc_troutine(struct work_struct *work) -{ - unsigned int tmp = get_rtc_ss(); - - if (stop_rtc_timers) { - stask_active = 0; - return; - } - - if (oldsecs != tmp){ - oldsecs = tmp; - - timer_task.function = gen_rtc_timer; - timer_task.expires = jiffies + HZ - (HZ/10); - tt_exp=timer_task.expires; - ttask_active=1; - stask_active=0; - add_timer(&timer_task); - - gen_rtc_interrupt(0); - } else if (schedule_work(&genrtc_task) == 0) - stask_active = 0; -} - -static void gen_rtc_timer(unsigned long data) -{ - lostint = get_rtc_ss() - oldsecs ; - if (lostint<0) - lostint = 60 - lostint; - if (time_after(jiffies, tt_exp)) - printk(KERN_INFO "genrtc: timer task delayed by %ld jiffies\n", - jiffies-tt_exp); - ttask_active=0; - stask_active=1; - if ((schedule_work(&genrtc_task) == 0)) - stask_active = 0; -} - -/* - * call gen_rtc_interrupt function to signal an RTC_UIE, - * arg is unused. 
- * Could be invoked either from a real interrupt handler or - * from some routine that periodically (eg 100HZ) monitors - * whether RTC_SECS changed - */ -static void gen_rtc_interrupt(unsigned long arg) -{ - /* We store the status in the low byte and the number of - * interrupts received since the last read in the remainder - * of rtc_irq_data. */ - - gen_rtc_irq_data += 0x100; - gen_rtc_irq_data &= ~0xff; - gen_rtc_irq_data |= RTC_UIE; - - if (lostint){ - printk("genrtc: system delaying clock ticks?\n"); - /* increment count so that userspace knows something is wrong */ - gen_rtc_irq_data += ((lostint-1)<<8); - lostint = 0; - } - - wake_up_interruptible(&gen_rtc_wait); -} - -/* - * Now all the various file operations that we export. - */ -static ssize_t gen_rtc_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - unsigned long data; - ssize_t retval; - - if (count != sizeof (unsigned int) && count != sizeof (unsigned long)) - return -EINVAL; - - if (file->f_flags & O_NONBLOCK && !gen_rtc_irq_data) - return -EAGAIN; - - retval = wait_event_interruptible(gen_rtc_wait, - (data = xchg(&gen_rtc_irq_data, 0))); - if (retval) - goto out; - - /* first test allows optimizer to nuke this case for 32-bit machines */ - if (sizeof (int) != sizeof (long) && count == sizeof (unsigned int)) { - unsigned int uidata = data; - retval = put_user(uidata, (unsigned int __user *)buf) ?: - sizeof(unsigned int); - } - else { - retval = put_user(data, (unsigned long __user *)buf) ?: - sizeof(unsigned long); - } -out: - return retval; -} - -static unsigned int gen_rtc_poll(struct file *file, - struct poll_table_struct *wait) -{ - poll_wait(file, &gen_rtc_wait, wait); - if (gen_rtc_irq_data != 0) - return POLLIN | POLLRDNORM; - return 0; -} - -#endif - -/* - * Used to disable/enable interrupts, only RTC_UIE supported - * We also clear out any old irq data after an ioctl() that - * meddles with the interrupt enable/disable bits. 
- */ - -static inline void gen_clear_rtc_irq_bit(unsigned char bit) -{ -#ifdef CONFIG_GEN_RTC_X - stop_rtc_timers = 1; - if (ttask_active){ - del_timer_sync(&timer_task); - ttask_active = 0; - } - while (stask_active) - schedule(); - - spin_lock(&gen_rtc_lock); - irq_active = 0; - spin_unlock(&gen_rtc_lock); -#endif -} - -static inline int gen_set_rtc_irq_bit(unsigned char bit) -{ -#ifdef CONFIG_GEN_RTC_X - spin_lock(&gen_rtc_lock); - if ( !irq_active ) { - irq_active = 1; - stop_rtc_timers = 0; - lostint = 0; - INIT_WORK(&genrtc_task, genrtc_troutine); - oldsecs = get_rtc_ss(); - init_timer(&timer_task); - - stask_active = 1; - if (schedule_work(&genrtc_task) == 0){ - stask_active = 0; - } - } - spin_unlock(&gen_rtc_lock); - gen_rtc_irq_data = 0; - return 0; -#else - return -EINVAL; -#endif -} - -static int gen_rtc_ioctl(struct file *file, - unsigned int cmd, unsigned long arg) -{ - struct rtc_time wtime; - struct rtc_pll_info pll; - void __user *argp = (void __user *)arg; - - switch (cmd) { - - case RTC_PLL_GET: - if (get_rtc_pll(&pll)) - return -EINVAL; - else - return copy_to_user(argp, &pll, sizeof pll) ? -EFAULT : 0; - - case RTC_PLL_SET: - if (!capable(CAP_SYS_TIME)) - return -EACCES; - if (copy_from_user(&pll, argp, sizeof(pll))) - return -EFAULT; - return set_rtc_pll(&pll); - - case RTC_UIE_OFF: /* disable ints from RTC updates. */ - gen_clear_rtc_irq_bit(RTC_UIE); - return 0; - - case RTC_UIE_ON: /* enable ints for RTC updates. */ - return gen_set_rtc_irq_bit(RTC_UIE); - - case RTC_RD_TIME: /* Read the time/date from RTC */ - /* this doesn't get week-day, who cares */ - memset(&wtime, 0, sizeof(wtime)); - get_rtc_time(&wtime); - - return copy_to_user(argp, &wtime, sizeof(wtime)) ? 
-EFAULT : 0; - - case RTC_SET_TIME: /* Set the RTC */ - { - int year; - unsigned char leap_yr; - - if (!capable(CAP_SYS_TIME)) - return -EACCES; - - if (copy_from_user(&wtime, argp, sizeof(wtime))) - return -EFAULT; - - year = wtime.tm_year + 1900; - leap_yr = ((!(year % 4) && (year % 100)) || - !(year % 400)); - - if ((wtime.tm_mon < 0 || wtime.tm_mon > 11) || (wtime.tm_mday < 1)) - return -EINVAL; - - if (wtime.tm_mday < 0 || wtime.tm_mday > - (days_in_mo[wtime.tm_mon] + ((wtime.tm_mon == 1) && leap_yr))) - return -EINVAL; - - if (wtime.tm_hour < 0 || wtime.tm_hour >= 24 || - wtime.tm_min < 0 || wtime.tm_min >= 60 || - wtime.tm_sec < 0 || wtime.tm_sec >= 60) - return -EINVAL; - - return set_rtc_time(&wtime); - } - } - - return -EINVAL; -} - -static long gen_rtc_unlocked_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret; - - mutex_lock(&gen_rtc_mutex); - ret = gen_rtc_ioctl(file, cmd, arg); - mutex_unlock(&gen_rtc_mutex); - - return ret; -} - -/* - * We enforce only one user at a time here with the open/close. - * Also clear the previous interrupt data on an open, and clean - * up things on a close. - */ - -static int gen_rtc_open(struct inode *inode, struct file *file) -{ - mutex_lock(&gen_rtc_mutex); - if (gen_rtc_status & RTC_IS_OPEN) { - mutex_unlock(&gen_rtc_mutex); - return -EBUSY; - } - - gen_rtc_status |= RTC_IS_OPEN; - gen_rtc_irq_data = 0; - irq_active = 0; - mutex_unlock(&gen_rtc_mutex); - - return 0; -} - -static int gen_rtc_release(struct inode *inode, struct file *file) -{ - /* - * Turn off all interrupts once the device is no longer - * in use and clear the data. - */ - - gen_clear_rtc_irq_bit(RTC_PIE|RTC_AIE|RTC_UIE); - - gen_rtc_status &= ~RTC_IS_OPEN; - return 0; -} - - -#ifdef CONFIG_PROC_FS - -/* - * Info exported via "/proc/driver/rtc". 
- */ - -static int gen_rtc_proc_show(struct seq_file *m, void *v) -{ - struct rtc_time tm; - unsigned int flags; - struct rtc_pll_info pll; - - flags = get_rtc_time(&tm); - - seq_printf(m, - "rtc_time\t: %02d:%02d:%02d\n" - "rtc_date\t: %04d-%02d-%02d\n" - "rtc_epoch\t: %04u\n", - tm.tm_hour, tm.tm_min, tm.tm_sec, - tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, 1900); - - tm.tm_hour = tm.tm_min = tm.tm_sec = 0; - - seq_puts(m, "alarm\t\t: "); - if (tm.tm_hour <= 24) - seq_printf(m, "%02d:", tm.tm_hour); - else - seq_puts(m, "**:"); - - if (tm.tm_min <= 59) - seq_printf(m, "%02d:", tm.tm_min); - else - seq_puts(m, "**:"); - - if (tm.tm_sec <= 59) - seq_printf(m, "%02d\n", tm.tm_sec); - else - seq_puts(m, "**\n"); - - seq_printf(m, - "DST_enable\t: %s\n" - "BCD\t\t: %s\n" - "24hr\t\t: %s\n" - "square_wave\t: %s\n" - "alarm_IRQ\t: %s\n" - "update_IRQ\t: %s\n" - "periodic_IRQ\t: %s\n" - "periodic_freq\t: %ld\n" - "batt_status\t: %s\n", - (flags & RTC_DST_EN) ? "yes" : "no", - (flags & RTC_DM_BINARY) ? "no" : "yes", - (flags & RTC_24H) ? "yes" : "no", - (flags & RTC_SQWE) ? "yes" : "no", - (flags & RTC_AIE) ? "yes" : "no", - irq_active ? "yes" : "no", - (flags & RTC_PIE) ? "yes" : "no", - 0L /* freq */, - (flags & RTC_BATT_BAD) ? 
"bad" : "okay"); - if (!get_rtc_pll(&pll)) - seq_printf(m, - "PLL adjustment\t: %d\n" - "PLL max +ve adjustment\t: %d\n" - "PLL max -ve adjustment\t: %d\n" - "PLL +ve adjustment factor\t: %d\n" - "PLL -ve adjustment factor\t: %d\n" - "PLL frequency\t: %ld\n", - pll.pll_value, - pll.pll_max, - pll.pll_min, - pll.pll_posmult, - pll.pll_negmult, - pll.pll_clock); - return 0; -} - -static int gen_rtc_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, gen_rtc_proc_show, NULL); -} - -static const struct file_operations gen_rtc_proc_fops = { - .open = gen_rtc_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init gen_rtc_proc_init(void) -{ - struct proc_dir_entry *r; - - r = proc_create("driver/rtc", 0, NULL, &gen_rtc_proc_fops); - if (!r) - return -ENOMEM; - return 0; -} -#else -static inline int gen_rtc_proc_init(void) { return 0; } -#endif /* CONFIG_PROC_FS */ - - -/* - * The various file operations we support. 
- */ - -static const struct file_operations gen_rtc_fops = { - .owner = THIS_MODULE, -#ifdef CONFIG_GEN_RTC_X - .read = gen_rtc_read, - .poll = gen_rtc_poll, -#endif - .unlocked_ioctl = gen_rtc_unlocked_ioctl, - .open = gen_rtc_open, - .release = gen_rtc_release, - .llseek = noop_llseek, -}; - -static struct miscdevice rtc_gen_dev = -{ - .minor = RTC_MINOR, - .name = "rtc", - .fops = &gen_rtc_fops, -}; - -static int __init rtc_generic_init(void) -{ - int retval; - - printk(KERN_INFO "Generic RTC Driver v%s\n", RTC_VERSION); - - retval = misc_register(&rtc_gen_dev); - if (retval < 0) - return retval; - - retval = gen_rtc_proc_init(); - if (retval) { - misc_deregister(&rtc_gen_dev); - return retval; - } - - return 0; -} - -static void __exit rtc_generic_exit(void) -{ - remove_proc_entry ("driver/rtc", NULL); - misc_deregister(&rtc_gen_dev); -} - - -module_init(rtc_generic_init); -module_exit(rtc_generic_exit); - -MODULE_AUTHOR("Richard Zidlicky"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_MISCDEV(RTC_MINOR); diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index c822d72629d5..74919aa81dcb 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -32,7 +32,6 @@ config CPU_FREQ_BOOST_SW config CPU_FREQ_STAT bool "CPU frequency transition statistics" - default y help Export CPU frequency statistics information through sysfs. 
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9ec033b4f2d9..be9eade147f2 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1374,6 +1374,8 @@ MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_params), + ICPU(INTEL_FAM6_BROADWELL_X, core_params), + ICPU(INTEL_FAM6_SKYLAKE_X, core_params), {} }; diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 2137adfbd8c3..e9b7dc037ff8 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -84,6 +84,7 @@ source "drivers/infiniband/ulp/iser/Kconfig" source "drivers/infiniband/ulp/isert/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" +source "drivers/infiniband/sw/rxe/Kconfig" source "drivers/infiniband/hw/hfi1/Kconfig" diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index ad1b1adcf6f0..e6dfa1bd3def 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -68,6 +68,7 @@ MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 +#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 @@ -162,6 +163,14 @@ struct rdma_bind_list { unsigned short port; }; +struct class_port_info_context { + struct ib_class_port_info *class_port_info; + struct ib_device *device; + struct completion done; + struct ib_sa_query *sa_query; + u8 port_num; +}; + static int cma_ps_alloc(struct net *net, enum rdma_port_space ps, struct rdma_bind_list *bind_list, int snum) { @@ -306,6 +315,7 @@ struct cma_multicast { struct sockaddr_storage addr; struct kref mcref; bool igmp_joined; + u8 join_state; }; struct cma_work { @@ -3752,10 +3762,63 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, } } +static void 
cma_query_sa_classport_info_cb(int status, + struct ib_class_port_info *rec, + void *context) +{ + struct class_port_info_context *cb_ctx = context; + + WARN_ON(!context); + + if (status || !rec) { + pr_debug("RDMA CM: %s port %u failed query ClassPortInfo status: %d\n", + cb_ctx->device->name, cb_ctx->port_num, status); + goto out; + } + + memcpy(cb_ctx->class_port_info, rec, sizeof(struct ib_class_port_info)); + +out: + complete(&cb_ctx->done); +} + +static int cma_query_sa_classport_info(struct ib_device *device, u8 port_num, + struct ib_class_port_info *class_port_info) +{ + struct class_port_info_context *cb_ctx; + int ret; + + cb_ctx = kmalloc(sizeof(*cb_ctx), GFP_KERNEL); + if (!cb_ctx) + return -ENOMEM; + + cb_ctx->device = device; + cb_ctx->class_port_info = class_port_info; + cb_ctx->port_num = port_num; + init_completion(&cb_ctx->done); + + ret = ib_sa_classport_info_rec_query(&sa_client, device, port_num, + CMA_QUERY_CLASSPORT_INFO_TIMEOUT, + GFP_KERNEL, cma_query_sa_classport_info_cb, + cb_ctx, &cb_ctx->sa_query); + if (ret < 0) { + pr_err("RDMA CM: %s port %u failed to send ClassPortInfo query, ret: %d\n", + device->name, port_num, ret); + goto out; + } + + wait_for_completion(&cb_ctx->done); + +out: + kfree(cb_ctx); + return ret; +} + static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; + struct ib_class_port_info class_port_info; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret; @@ -3774,7 +3837,24 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); - rec.join_state = 1; + rec.join_state = mc->join_state; + + if (rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) { + ret = cma_query_sa_classport_info(id_priv->id.device, + id_priv->id.port_num, + &class_port_info); + + if (ret) + 
return ret; + + if (!(ib_get_cpi_capmask2(&class_port_info) & + IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT)) { + pr_warn("RDMA CM: %s port %u Unable to multicast join\n" + "RDMA CM: SM doesn't support Send Only Full Member option\n", + id_priv->id.device->name, id_priv->id.port_num); + return -EOPNOTSUPP; + } + } comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | @@ -3843,6 +3923,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; enum ib_gid_type gid_type; + bool send_only; + + send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; @@ -3878,10 +3961,12 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; - err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, - true); - if (!err) - mc->igmp_joined = true; + if (!send_only) { + err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, + true); + if (!err) + mc->igmp_joined = true; + } } } else { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) @@ -3911,7 +3996,7 @@ out1: } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, - void *context) + u8 join_state, void *context) { struct rdma_id_private *id_priv; struct cma_multicast *mc; @@ -3930,6 +4015,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, mc->context = context; mc->id_priv = id_priv; mc->igmp_joined = false; + mc->join_state = join_state; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 5c155fa91eec..760ef603a468 100644 --- a/drivers/infiniband/core/device.c +++ 
b/drivers/infiniband/core/device.c @@ -311,6 +311,15 @@ static int read_port_immutable(struct ib_device *device) return 0; } +void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len) +{ + if (dev->get_dev_fw_str) + dev->get_dev_fw_str(dev, str, str_len); + else + str[0] = '\0'; +} +EXPORT_SYMBOL(ib_get_device_fw_str); + /** * ib_register_device - Register an IB device with IB core * @device:Device to register diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index f0572049d291..357624f8b9d3 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -183,15 +183,14 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv) /* * Release a reference on cm_id. If the last reference is being - * released, enable the waiting thread (in iw_destroy_cm_id) to - * get woken up, and return 1 if a thread is already waiting. + * released, free the cm_id and return 1. */ static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { BUG_ON(atomic_read(&cm_id_priv->refcount)==0); if (atomic_dec_and_test(&cm_id_priv->refcount)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); - complete(&cm_id_priv->destroy_comp); + free_cm_id(cm_id_priv); return 1; } @@ -208,19 +207,10 @@ static void add_ref(struct iw_cm_id *cm_id) static void rem_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; - int cb_destroy; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); - /* - * Test bit before deref in case the cm_id gets freed on another - * thread. 
- */ - cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - if (iwcm_deref_id(cm_id_priv) && cb_destroy) { - BUG_ON(!list_empty(&cm_id_priv->work_list)); - free_cm_id(cm_id_priv); - } + (void)iwcm_deref_id(cm_id_priv); } static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); @@ -370,6 +360,12 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) wait_event(cm_id_priv->connect_wait, !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); + /* + * Since we're deleting the cm_id, drop any events that + * might arrive before the last dereference. + */ + set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags); + spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_LISTEN: @@ -433,13 +429,7 @@ void iw_destroy_cm_id(struct iw_cm_id *cm_id) struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); - BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)); - destroy_cm_id(cm_id); - - wait_for_completion(&cm_id_priv->destroy_comp); - - free_cm_id(cm_id_priv); } EXPORT_SYMBOL(iw_destroy_cm_id); @@ -809,10 +799,7 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, ret = cm_id->cm_handler(cm_id, iw_event); if (ret) { iw_cm_reject(cm_id, NULL, 0); - set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - destroy_cm_id(cm_id); - if (atomic_read(&cm_id_priv->refcount)==0) - free_cm_id(cm_id_priv); + iw_destroy_cm_id(cm_id); } out: @@ -1000,7 +987,6 @@ static void cm_work_handler(struct work_struct *_work) unsigned long flags; int empty; int ret = 0; - int destroy_id; spin_lock_irqsave(&cm_id_priv->lock, flags); empty = list_empty(&cm_id_priv->work_list); @@ -1013,20 +999,14 @@ static void cm_work_handler(struct work_struct *_work) put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = process_event(cm_id_priv, &levent); - if (ret) { - set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - destroy_cm_id(&cm_id_priv->id); - } - 
BUG_ON(atomic_read(&cm_id_priv->refcount)==0); - destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - if (iwcm_deref_id(cm_id_priv)) { - if (destroy_id) { - BUG_ON(!list_empty(&cm_id_priv->work_list)); - free_cm_id(cm_id_priv); - } + if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { + ret = process_event(cm_id_priv, &levent); + if (ret) + destroy_cm_id(&cm_id_priv->id); + } else + pr_debug("dropping event %d\n", levent.event); + if (iwcm_deref_id(cm_id_priv)) return; - } if (empty) return; spin_lock_irqsave(&cm_id_priv->lock, flags); diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h index 3f6cc82564c8..82c2cd1b0a80 100644 --- a/drivers/infiniband/core/iwcm.h +++ b/drivers/infiniband/core/iwcm.h @@ -56,7 +56,7 @@ struct iwcm_id_private { struct list_head work_free_list; }; -#define IWCM_F_CALLBACK_DESTROY 1 +#define IWCM_F_DROP_EVENTS 1 #define IWCM_F_CONNECT_WAIT 2 #endif /* IWCM_H */ diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index b65e06c560d7..ade71e7f0131 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -37,6 +37,7 @@ #define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1) #define IWPM_REMINFO_HASH_SIZE 64 #define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1) +#define IWPM_MSG_SIZE 512 static LIST_HEAD(iwpm_nlmsg_req_list); static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock); @@ -452,7 +453,7 @@ struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, { struct sk_buff *skb = NULL; - skb = dev_alloc_skb(NLMSG_GOODSIZE); + skb = dev_alloc_skb(IWPM_MSG_SIZE); if (!skb) { pr_err("%s Unable to allocate skb\n", __func__); goto create_nlmsg_exit; diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index a83ec28a147b..3a3c5d73bbfc 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -93,18 +93,6 @@ enum { struct mcast_member; -/* -* There are 4 
types of join states: -* FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. -*/ -enum { - FULLMEMBER_JOIN, - NONMEMBER_JOIN, - SENDONLY_NONMEBER_JOIN, - SENDONLY_FULLMEMBER_JOIN, - NUM_JOIN_MEMBERSHIP_TYPES, -}; - struct mcast_group { struct ib_sa_mcmember_rec rec; struct rb_node node; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 9b8c20c8209b..10469b0088b5 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -229,7 +229,10 @@ static void ibnl_rcv(struct sk_buff *skb) int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, __u32 pid) { - return nlmsg_unicast(nls, skb, pid); + int err; + + err = netlink_unicast(nls, skb, pid, 0); + return (err < 0) ? err : 0; } EXPORT_SYMBOL(ibnl_unicast); @@ -252,6 +255,7 @@ int __init ibnl_init(void) return -ENOMEM; } + nls->sk_sndtimeo = 10 * HZ; return 0; } diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 1eb9b1294a63..dbfd854c32c9 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -58,19 +58,13 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, return false; } -static inline u32 rdma_rw_max_sge(struct ib_device *dev, - enum dma_data_direction dir) -{ - return dir == DMA_TO_DEVICE ? - dev->attrs.max_sge : dev->attrs.max_sge_rd; -} - static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) { /* arbitrary limit to avoid allocating gigantic resources */ return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256); } +/* Caller must have zero-initialized *reg. 
*/ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, u32 sg_cnt, u32 offset) @@ -114,6 +108,7 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { + struct rdma_rw_reg_ctx *prev = NULL; u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); int i, j, ret = 0, count = 0; @@ -125,7 +120,6 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, } for (i = 0; i < ctx->nr_ops; i++) { - struct rdma_rw_reg_ctx *prev = i ? &ctx->reg[i - 1] : NULL; struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; u32 nents = min(sg_cnt, pages_per_mr); @@ -162,9 +156,13 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, sg_cnt -= nents; for (j = 0; j < nents; j++) sg = sg_next(sg); + prev = reg; offset = 0; } + if (prev) + prev->wr.wr.next = NULL; + ctx->type = RDMA_RW_MR; return count; @@ -181,7 +179,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_device *dev = qp->pd->device; - u32 max_sge = rdma_rw_max_sge(dev, dir); + u32 max_sge = dir == DMA_TO_DEVICE ? 
qp->max_write_sge : + qp->max_read_sge; struct ib_sge *sge; u32 total_len = 0, i, j; @@ -205,11 +204,10 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, rdma_wr->wr.opcode = IB_WR_RDMA_READ; rdma_wr->remote_addr = remote_addr + total_len; rdma_wr->rkey = rkey; + rdma_wr->wr.num_sge = nr_sge; rdma_wr->wr.sg_list = sge; for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { - rdma_wr->wr.num_sge++; - sge->addr = ib_sg_dma_address(dev, sg) + offset; sge->length = ib_sg_dma_len(dev, sg) - offset; sge->lkey = qp->pd->local_dma_lkey; @@ -220,8 +218,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, offset = 0; } - if (i + 1 < ctx->nr_ops) - rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr; + rdma_wr->wr.next = i + 1 < ctx->nr_ops ? + &ctx->map.wrs[i + 1].wr : NULL; } ctx->type = RDMA_RW_MULTI_WR; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index e95538650dc6..b9bf7aa055e7 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -65,10 +65,17 @@ struct ib_sa_sm_ah { u8 src_path_mask; }; +struct ib_sa_classport_cache { + bool valid; + struct ib_class_port_info data; +}; + struct ib_sa_port { struct ib_mad_agent *agent; struct ib_sa_sm_ah *sm_ah; struct work_struct update_task; + struct ib_sa_classport_cache classport_info; + spinlock_t classport_lock; /* protects class port info set */ spinlock_t ah_lock; u8 port_num; }; @@ -998,6 +1005,13 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event port->sm_ah = NULL; spin_unlock_irqrestore(&port->ah_lock, flags); + if (event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_LID_CHANGE) { + spin_lock_irqsave(&port->classport_lock, flags); + port->classport_info.valid = false; + spin_unlock_irqrestore(&port->classport_lock, flags); + } queue_work(ib_wq, &sa_dev->port[event->element.port_num - 
sa_dev->start_port].update_task); } @@ -1719,6 +1733,7 @@ static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { + unsigned long flags; struct ib_sa_classport_info_query *query = container_of(sa_query, struct ib_sa_classport_info_query, sa_query); @@ -1728,6 +1743,16 @@ static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, ib_unpack(classport_info_rec_table, ARRAY_SIZE(classport_info_rec_table), mad->data, &rec); + + spin_lock_irqsave(&sa_query->port->classport_lock, flags); + if (!status && !sa_query->port->classport_info.valid) { + memcpy(&sa_query->port->classport_info.data, &rec, + sizeof(sa_query->port->classport_info.data)); + + sa_query->port->classport_info.valid = true; + } + spin_unlock_irqrestore(&sa_query->port->classport_lock, flags); + query->callback(status, &rec, query->context); } else { query->callback(status, NULL, query->context); @@ -1754,7 +1779,9 @@ int ib_sa_classport_info_rec_query(struct ib_sa_client *client, struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; + struct ib_class_port_info cached_class_port_info; int ret; + unsigned long flags; if (!sa_dev) return -ENODEV; @@ -1762,6 +1789,17 @@ int ib_sa_classport_info_rec_query(struct ib_sa_client *client, port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; + /* Use cached ClassPortInfo attribute if valid instead of sending mad */ + spin_lock_irqsave(&port->classport_lock, flags); + if (port->classport_info.valid && callback) { + memcpy(&cached_class_port_info, &port->classport_info.data, + sizeof(cached_class_port_info)); + spin_unlock_irqrestore(&port->classport_lock, flags); + callback(0, &cached_class_port_info, context); + return 0; + } + spin_unlock_irqrestore(&port->classport_lock, flags); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; @@ -1885,6 +1923,9 @@ static void ib_sa_add_one(struct ib_device *device) sa_dev->port[i].sm_ah = 
NULL; sa_dev->port[i].port_num = i + s; + spin_lock_init(&sa_dev->port[i].classport_lock); + sa_dev->port[i].classport_info.valid = false; + sa_dev->port[i].agent = ib_register_mad_agent(device, i + s, IB_QPT_GSI, NULL, 0, send_handler, diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 60df4f8e81be..15defefecb4f 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -38,6 +38,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/netdevice.h> +#include <linux/ethtool.h> #include <rdma/ib_mad.h> #include <rdma/ib_pma.h> @@ -1200,16 +1201,28 @@ static ssize_t set_node_desc(struct device *device, return count; } +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + ib_get_device_fw_str(dev, buf, PAGE_SIZE); + strlcat(buf, "\n", PAGE_SIZE); + return strlen(buf); +} + static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static struct device_attribute *ib_class_attributes[] = { &dev_attr_node_type, &dev_attr_sys_image_guid, &dev_attr_node_guid, - &dev_attr_node_desc + &dev_attr_node_desc, + &dev_attr_fw_ver, }; static void free_port_list_attributes(struct ib_device *device) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index c0f3826abb30..2825ece91d3c 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -106,6 +106,7 @@ struct ucma_multicast { int events_reported; u64 uid; + u8 join_state; struct list_head list; struct sockaddr_storage addr; }; @@ -1317,12 +1318,20 @@ static ssize_t ucma_process_join(struct ucma_file *file, struct 
ucma_multicast *mc; struct sockaddr *addr; int ret; + u8 join_state; if (out_len < sizeof(resp)) return -ENOSPC; addr = (struct sockaddr *) &cmd->addr; - if (cmd->reserved || !cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr))) + if (!cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr))) + return -EINVAL; + + if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER) + join_state = BIT(FULLMEMBER_JOIN); + else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER) + join_state = BIT(SENDONLY_FULLMEMBER_JOIN); + else return -EINVAL; ctx = ucma_get_ctx(file, cmd->id); @@ -1335,10 +1344,11 @@ static ssize_t ucma_process_join(struct ucma_file *file, ret = -ENOMEM; goto err1; } - + mc->join_state = join_state; mc->uid = cmd->uid; memcpy(&mc->addr, addr, cmd->addr_size); - ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc); + ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, + join_state, mc); if (ret) goto err2; @@ -1382,7 +1392,7 @@ static ssize_t ucma_join_ip_multicast(struct ucma_file *file, join_cmd.uid = cmd.uid; join_cmd.id = cmd.id; join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr); - join_cmd.reserved = 0; + join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER; memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); return ucma_process_join(file, &join_cmd, out_len); diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 612ccfd39bf9..df26a741cda6 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -116,6 +116,7 @@ struct ib_uverbs_event_file { struct ib_uverbs_file { struct kref ref; struct mutex mutex; + struct mutex cleanup_mutex; /* protect cleanup */ struct ib_uverbs_device *device; struct ib_ucontext *ucontext; struct ib_event_handler event_handler; @@ -162,6 +163,10 @@ struct ib_uqp_object { struct ib_uxrcd_object *uxrcd; }; +struct ib_uwq_object { + struct ib_uevent_object uevent; +}; + struct ib_ucq_object { struct 
ib_uobject uobject; struct ib_uverbs_file *uverbs_file; @@ -181,6 +186,8 @@ extern struct idr ib_uverbs_qp_idr; extern struct idr ib_uverbs_srq_idr; extern struct idr ib_uverbs_xrcd_idr; extern struct idr ib_uverbs_rule_idr; +extern struct idr ib_uverbs_wq_idr; +extern struct idr ib_uverbs_rwq_ind_tbl_idr; void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); @@ -199,6 +206,7 @@ void ib_uverbs_release_uevent(struct ib_uverbs_file *file, void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context); void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); @@ -219,6 +227,7 @@ struct ib_uverbs_flow_spec { struct ib_uverbs_flow_spec_eth eth; struct ib_uverbs_flow_spec_ipv4 ipv4; struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + struct ib_uverbs_flow_spec_ipv6 ipv6; }; }; @@ -275,5 +284,10 @@ IB_UVERBS_DECLARE_EX_CMD(destroy_flow); IB_UVERBS_DECLARE_EX_CMD(query_device); IB_UVERBS_DECLARE_EX_CMD(create_cq); IB_UVERBS_DECLARE_EX_CMD(create_qp); +IB_UVERBS_DECLARE_EX_CMD(create_wq); +IB_UVERBS_DECLARE_EX_CMD(modify_wq); +IB_UVERBS_DECLARE_EX_CMD(destroy_wq); +IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table); +IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 825021d1008b..f6647318138d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -57,6 +57,8 @@ static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; static struct 
uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; +static struct uverbs_lock_class wq_lock_class = { .name = "WQ-uobj" }; +static struct uverbs_lock_class rwq_ind_table_lock_class = { .name = "IND_TBL-uobj" }; /* * The ib_uobject locking scheme is as follows: @@ -243,6 +245,27 @@ static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context) return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0); } +static struct ib_wq *idr_read_wq(int wq_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_wq_idr, wq_handle, context, 0); +} + +static void put_wq_read(struct ib_wq *wq) +{ + put_uobj_read(wq->uobject); +} + +static struct ib_rwq_ind_table *idr_read_rwq_indirection_table(int ind_table_handle, + struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_rwq_ind_tbl_idr, ind_table_handle, context, 0); +} + +static void put_rwq_indirection_table_read(struct ib_rwq_ind_table *ind_table) +{ + put_uobj_read(ind_table->uobject); +} + static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context) { struct ib_uobject *uobj; @@ -326,6 +349,8 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, INIT_LIST_HEAD(&ucontext->qp_list); INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); + INIT_LIST_HEAD(&ucontext->wq_list); + INIT_LIST_HEAD(&ucontext->rwq_ind_tbl_list); INIT_LIST_HEAD(&ucontext->xrcd_list); INIT_LIST_HEAD(&ucontext->rule_list); rcu_read_lock(); @@ -1750,6 +1775,8 @@ static int create_qp(struct ib_uverbs_file *file, struct ib_qp_init_attr attr = {}; struct ib_uverbs_ex_create_qp_resp resp; int ret; + struct ib_rwq_ind_table *ind_tbl = NULL; + bool has_sq = true; if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) return -EPERM; @@ -1761,6 +1788,32 @@ static int create_qp(struct ib_uverbs_file *file, init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &qp_lock_class); down_write(&obj->uevent.uobject.mutex); + if (cmd_sz >= 
offsetof(typeof(*cmd), rwq_ind_tbl_handle) + + sizeof(cmd->rwq_ind_tbl_handle) && + (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) { + ind_tbl = idr_read_rwq_indirection_table(cmd->rwq_ind_tbl_handle, + file->ucontext); + if (!ind_tbl) { + ret = -EINVAL; + goto err_put; + } + + attr.rwq_ind_tbl = ind_tbl; + } + + if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) + + sizeof(cmd->reserved1)) && cmd->reserved1) { + ret = -EOPNOTSUPP; + goto err_put; + } + + if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) { + ret = -EINVAL; + goto err_put; + } + + if (ind_tbl && !cmd->max_send_wr) + has_sq = false; if (cmd->qp_type == IB_QPT_XRC_TGT) { xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, @@ -1784,20 +1837,24 @@ static int create_qp(struct ib_uverbs_file *file, } } - if (cmd->recv_cq_handle != cmd->send_cq_handle) { - rcq = idr_read_cq(cmd->recv_cq_handle, - file->ucontext, 0); - if (!rcq) { - ret = -EINVAL; - goto err_put; + if (!ind_tbl) { + if (cmd->recv_cq_handle != cmd->send_cq_handle) { + rcq = idr_read_cq(cmd->recv_cq_handle, + file->ucontext, 0); + if (!rcq) { + ret = -EINVAL; + goto err_put; + } } } } - scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); - rcq = rcq ?: scq; + if (has_sq) + scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); + if (!ind_tbl) + rcq = rcq ?: scq; pd = idr_read_pd(cmd->pd_handle, file->ucontext); - if (!pd || !scq) { + if (!pd || (!scq && has_sq)) { ret = -EINVAL; goto err_put; } @@ -1864,16 +1921,20 @@ static int create_qp(struct ib_uverbs_file *file, qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; qp->srq = attr.srq; + qp->rwq_ind_tbl = ind_tbl; qp->event_handler = attr.event_handler; qp->qp_context = attr.qp_context; qp->qp_type = attr.qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); - atomic_inc(&attr.send_cq->usecnt); + if (attr.send_cq) + atomic_inc(&attr.send_cq->usecnt); if (attr.recv_cq) atomic_inc(&attr.recv_cq->usecnt); if (attr.srq) 
atomic_inc(&attr.srq->usecnt); + if (ind_tbl) + atomic_inc(&ind_tbl->usecnt); } qp->uobject = &obj->uevent.uobject; @@ -1913,6 +1974,8 @@ static int create_qp(struct ib_uverbs_file *file, put_cq_read(rcq); if (srq) put_srq_read(srq); + if (ind_tbl) + put_rwq_indirection_table_read(ind_tbl); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); @@ -1940,6 +2003,8 @@ err_put: put_cq_read(rcq); if (srq) put_srq_read(srq); + if (ind_tbl) + put_rwq_indirection_table_read(ind_tbl); put_uobj_write(&obj->uevent.uobject); return ret; @@ -2033,7 +2098,7 @@ int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file, if (err) return err; - if (cmd.comp_mask) + if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK) return -EINVAL; if (cmd.reserved) @@ -3040,6 +3105,15 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask, sizeof(struct ib_flow_ipv4_filter)); break; + case IB_FLOW_SPEC_IPV6: + ib_spec->ipv6.size = sizeof(struct ib_flow_spec_ipv6); + if (ib_spec->ipv6.size != kern_spec->ipv6.size) + return -EINVAL; + memcpy(&ib_spec->ipv6.val, &kern_spec->ipv6.val, + sizeof(struct ib_flow_ipv6_filter)); + memcpy(&ib_spec->ipv6.mask, &kern_spec->ipv6.mask, + sizeof(struct ib_flow_ipv6_filter)); + break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp); @@ -3056,6 +3130,445 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, return 0; } +int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_wq cmd = {}; + struct ib_uverbs_ex_create_wq_resp resp = {}; + struct ib_uwq_object *obj; + int err = 0; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_wq *wq; + struct ib_wq_init_attr wq_init_attr = {}; + size_t required_cmd_sz; + size_t required_resp_len; + + required_cmd_sz = 
offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge); + required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn); + + if (ucore->inlen < required_cmd_sz) + return -EINVAL; + + if (ucore->outlen < required_resp_len) + return -ENOSPC; + + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + + err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (err) + return err; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + obj = kmalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, + &wq_lock_class); + down_write(&obj->uevent.uobject.mutex); + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + err = -EINVAL; + goto err_uobj; + } + + cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + if (!cq) { + err = -EINVAL; + goto err_put_pd; + } + + wq_init_attr.cq = cq; + wq_init_attr.max_sge = cmd.max_sge; + wq_init_attr.max_wr = cmd.max_wr; + wq_init_attr.wq_context = file; + wq_init_attr.wq_type = cmd.wq_type; + wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + wq = pd->device->create_wq(pd, &wq_init_attr, uhw); + if (IS_ERR(wq)) { + err = PTR_ERR(wq); + goto err_put_cq; + } + + wq->uobject = &obj->uevent.uobject; + obj->uevent.uobject.object = wq; + wq->wq_type = wq_init_attr.wq_type; + wq->cq = cq; + wq->pd = pd; + wq->device = pd->device; + wq->wq_context = wq_init_attr.wq_context; + atomic_set(&wq->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&cq->usecnt); + wq->uobject = &obj->uevent.uobject; + obj->uevent.uobject.object = wq; + err = idr_add_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject); + if (err) + goto destroy_wq; + + memset(&resp, 0, sizeof(resp)); + resp.wq_handle = obj->uevent.uobject.id; + resp.max_sge = wq_init_attr.max_sge; + resp.max_wr = wq_init_attr.max_wr; + resp.wqn = 
wq->wq_num; + resp.response_length = required_resp_len; + err = ib_copy_to_udata(ucore, + &resp, resp.response_length); + if (err) + goto err_copy; + + put_pd_read(pd); + put_cq_read(cq); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->wq_list); + mutex_unlock(&file->mutex); + + obj->uevent.uobject.live = 1; + up_write(&obj->uevent.uobject.mutex); + return 0; + +err_copy: + idr_remove_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject); +destroy_wq: + ib_destroy_wq(wq); +err_put_cq: + put_cq_read(cq); +err_put_pd: + put_pd_read(pd); +err_uobj: + put_uobj_write(&obj->uevent.uobject); + + return err; +} + +int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_destroy_wq cmd = {}; + struct ib_uverbs_ex_destroy_wq_resp resp = {}; + struct ib_wq *wq; + struct ib_uobject *uobj; + struct ib_uwq_object *obj; + size_t required_cmd_sz; + size_t required_resp_len; + int ret; + + required_cmd_sz = offsetof(typeof(cmd), wq_handle) + sizeof(cmd.wq_handle); + required_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + + if (ucore->inlen < required_cmd_sz) + return -EINVAL; + + if (ucore->outlen < required_resp_len) + return -ENOSPC; + + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + resp.response_length = required_resp_len; + uobj = idr_write_uobj(&ib_uverbs_wq_idr, cmd.wq_handle, + file->ucontext); + if (!uobj) + return -EINVAL; + + wq = uobj->object; + obj = container_of(uobj, struct ib_uwq_object, uevent.uobject); + ret = ib_destroy_wq(wq); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_wq_idr, uobj); + + 
mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + ib_uverbs_release_uevent(file, &obj->uevent); + resp.events_reported = obj->uevent.events_reported; + put_uobj(uobj); + + ret = ib_copy_to_udata(ucore, &resp, resp.response_length); + if (ret) + return ret; + + return 0; +} + +int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_modify_wq cmd = {}; + struct ib_wq *wq; + struct ib_wq_attr wq_attr = {}; + size_t required_cmd_sz; + int ret; + + required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state); + if (ucore->inlen < required_cmd_sz) + return -EINVAL; + + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (ret) + return ret; + + if (!cmd.attr_mask) + return -EINVAL; + + if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE)) + return -EINVAL; + + wq = idr_read_wq(cmd.wq_handle, file->ucontext); + if (!wq) + return -EINVAL; + + wq_attr.curr_wq_state = cmd.curr_wq_state; + wq_attr.wq_state = cmd.wq_state; + ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw); + put_wq_read(wq); + return ret; +} + +int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_rwq_ind_table cmd = {}; + struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {}; + struct ib_uobject *uobj; + int err = 0; + struct ib_rwq_ind_table_init_attr init_attr = {}; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_wq **wqs = NULL; + u32 *wqs_handles = NULL; + struct ib_wq *wq = NULL; + int i, j, num_read_wqs; + u32 num_wq_handles; + u32 expected_in_size; + size_t required_cmd_sz_header; + size_t required_resp_len; + + required_cmd_sz_header = 
offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size); + required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num); + + if (ucore->inlen < required_cmd_sz_header) + return -EINVAL; + + if (ucore->outlen < required_resp_len) + return -ENOSPC; + + err = ib_copy_from_udata(&cmd, ucore, required_cmd_sz_header); + if (err) + return err; + + ucore->inbuf += required_cmd_sz_header; + ucore->inlen -= required_cmd_sz_header; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE) + return -EINVAL; + + num_wq_handles = 1 << cmd.log_ind_tbl_size; + expected_in_size = num_wq_handles * sizeof(__u32); + if (num_wq_handles == 1) + /* input size for wq handles is u64 aligned */ + expected_in_size += sizeof(__u32); + + if (ucore->inlen < expected_in_size) + return -EINVAL; + + if (ucore->inlen > expected_in_size && + !ib_is_udata_cleared(ucore, expected_in_size, + ucore->inlen - expected_in_size)) + return -EOPNOTSUPP; + + wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles), + GFP_KERNEL); + if (!wqs_handles) + return -ENOMEM; + + err = ib_copy_from_udata(wqs_handles, ucore, + num_wq_handles * sizeof(__u32)); + if (err) + goto err_free; + + wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL); + if (!wqs) { + err = -ENOMEM; + goto err_free; + } + + for (num_read_wqs = 0; num_read_wqs < num_wq_handles; + num_read_wqs++) { + wq = idr_read_wq(wqs_handles[num_read_wqs], file->ucontext); + if (!wq) { + err = -EINVAL; + goto put_wqs; + } + + wqs[num_read_wqs] = wq; + } + + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) { + err = -ENOMEM; + goto put_wqs; + } + + init_uobj(uobj, 0, file->ucontext, &rwq_ind_table_lock_class); + down_write(&uobj->mutex); + init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; + init_attr.ind_tbl = wqs; + rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw); + + if (IS_ERR(rwq_ind_tbl)) { + err = PTR_ERR(rwq_ind_tbl); + goto 
err_uobj; + } + + rwq_ind_tbl->ind_tbl = wqs; + rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size; + rwq_ind_tbl->uobject = uobj; + uobj->object = rwq_ind_tbl; + rwq_ind_tbl->device = ib_dev; + atomic_set(&rwq_ind_tbl->usecnt, 0); + + for (i = 0; i < num_wq_handles; i++) + atomic_inc(&wqs[i]->usecnt); + + err = idr_add_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + if (err) + goto destroy_ind_tbl; + + resp.ind_tbl_handle = uobj->id; + resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num; + resp.response_length = required_resp_len; + + err = ib_copy_to_udata(ucore, + &resp, resp.response_length); + if (err) + goto err_copy; + + kfree(wqs_handles); + + for (j = 0; j < num_read_wqs; j++) + put_wq_read(wqs[j]); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->rwq_ind_tbl_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + return 0; + +err_copy: + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); +destroy_ind_tbl: + ib_destroy_rwq_ind_table(rwq_ind_tbl); +err_uobj: + put_uobj_write(uobj); +put_wqs: + for (j = 0; j < num_read_wqs; j++) + put_wq_read(wqs[j]); +err_free: + kfree(wqs_handles); + kfree(wqs); + return err; +} + +int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {}; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_uobject *uobj; + int ret; + struct ib_wq **ind_tbl; + size_t required_cmd_sz; + + required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle); + + if (ucore->inlen < required_cmd_sz) + return -EINVAL; + + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + uobj = 
idr_write_uobj(&ib_uverbs_rwq_ind_tbl_idr, cmd.ind_tbl_handle, + file->ucontext); + if (!uobj) + return -EINVAL; + rwq_ind_tbl = uobj->object; + ind_tbl = rwq_ind_tbl->ind_tbl; + + ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + kfree(ind_tbl); + return ret; +} + int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 31f422a70623..0012fa58c105 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -76,6 +76,8 @@ DEFINE_IDR(ib_uverbs_qp_idr); DEFINE_IDR(ib_uverbs_srq_idr); DEFINE_IDR(ib_uverbs_xrcd_idr); DEFINE_IDR(ib_uverbs_rule_idr); +DEFINE_IDR(ib_uverbs_wq_idr); +DEFINE_IDR(ib_uverbs_rwq_ind_tbl_idr); static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); @@ -130,6 +132,11 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device, [IB_USER_VERBS_EX_CMD_CREATE_CQ] = ib_uverbs_ex_create_cq, [IB_USER_VERBS_EX_CMD_CREATE_QP] = ib_uverbs_ex_create_qp, + [IB_USER_VERBS_EX_CMD_CREATE_WQ] = ib_uverbs_ex_create_wq, + [IB_USER_VERBS_EX_CMD_MODIFY_WQ] = ib_uverbs_ex_modify_wq, + [IB_USER_VERBS_EX_CMD_DESTROY_WQ] = ib_uverbs_ex_destroy_wq, + [IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table, + [IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table, }; static void ib_uverbs_add_one(struct ib_device *device); @@ -265,6 +272,27 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uqp); } + list_for_each_entry_safe(uobj, tmp, &context->rwq_ind_tbl_list, list) { + struct 
ib_rwq_ind_table *rwq_ind_tbl = uobj->object; + struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; + + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + ib_destroy_rwq_ind_table(rwq_ind_tbl); + kfree(ind_tbl); + kfree(uobj); + } + + list_for_each_entry_safe(uobj, tmp, &context->wq_list, list) { + struct ib_wq *wq = uobj->object; + struct ib_uwq_object *uwq = + container_of(uobj, struct ib_uwq_object, uevent.uobject); + + idr_remove_uobj(&ib_uverbs_wq_idr, uobj); + ib_destroy_wq(wq); + ib_uverbs_release_uevent(file, &uwq->uevent); + kfree(uwq); + } + list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { struct ib_srq *srq = uobj->object; struct ib_uevent_object *uevent = @@ -568,6 +596,16 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr) &uobj->events_reported); } +void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr) +{ + struct ib_uevent_object *uobj = container_of(event->element.wq->uobject, + struct ib_uevent_object, uobject); + + ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, + event->event, &uobj->event_list, + &uobj->events_reported); +} + void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr) { struct ib_uevent_object *uobj; @@ -931,6 +969,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) file->async_file = NULL; kref_init(&file->ref); mutex_init(&file->mutex); + mutex_init(&file->cleanup_mutex); filp->private_data = file; kobject_get(&dev->kobj); @@ -956,18 +995,20 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; struct ib_uverbs_device *dev = file->device; - struct ib_ucontext *ucontext = NULL; + + mutex_lock(&file->cleanup_mutex); + if (file->ucontext) { + ib_uverbs_cleanup_ucontext(file, file->ucontext); + file->ucontext = NULL; + } + mutex_unlock(&file->cleanup_mutex); mutex_lock(&file->device->lists_mutex); - ucontext = file->ucontext; - file->ucontext = NULL; if 
(!file->is_closed) { list_del(&file->list); file->is_closed = 1; } mutex_unlock(&file->device->lists_mutex); - if (ucontext) - ib_uverbs_cleanup_ucontext(file, ucontext); if (file->async_file) kref_put(&file->async_file->ref, ib_uverbs_release_event_file); @@ -1181,22 +1222,30 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, mutex_lock(&uverbs_dev->lists_mutex); while (!list_empty(&uverbs_dev->uverbs_file_list)) { struct ib_ucontext *ucontext; - file = list_first_entry(&uverbs_dev->uverbs_file_list, struct ib_uverbs_file, list); file->is_closed = 1; - ucontext = file->ucontext; list_del(&file->list); - file->ucontext = NULL; kref_get(&file->ref); mutex_unlock(&uverbs_dev->lists_mutex); - /* We must release the mutex before going ahead and calling - * disassociate_ucontext. disassociate_ucontext might end up - * indirectly calling uverbs_close, for example due to freeing - * the resources (e.g mmput). - */ + ib_uverbs_event_handler(&file->event_handler, &event); + + mutex_lock(&file->cleanup_mutex); + ucontext = file->ucontext; + file->ucontext = NULL; + mutex_unlock(&file->cleanup_mutex); + + /* At this point ib_uverbs_close cannot be running + * ib_uverbs_cleanup_ucontext + */ if (ucontext) { + /* We must release the mutex before going ahead and + * calling disassociate_ucontext. disassociate_ucontext + * might end up indirectly calling uverbs_close, + * for example due to freeing the resources + * (e.g mmput). 
+ */ ib_dev->disassociate_ucontext(ucontext); ib_uverbs_cleanup_ucontext(file, ucontext); } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 6298f54b4137..f2b776efab3a 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -758,6 +758,12 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp *qp; int ret; + if (qp_init_attr->rwq_ind_tbl && + (qp_init_attr->recv_cq || + qp_init_attr->srq || qp_init_attr->cap.max_recv_wr || + qp_init_attr->cap.max_recv_sge)) + return ERR_PTR(-EINVAL); + /* * If the callers is using the RDMA API calculate the resources * needed for the RDMA READ/WRITE operations. @@ -775,6 +781,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, qp->real_qp = qp; qp->uobject = NULL; qp->qp_type = qp_init_attr->qp_type; + qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl; atomic_set(&qp->usecnt, 0); qp->mrs_used = 0; @@ -792,7 +799,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, qp->srq = NULL; } else { qp->recv_cq = qp_init_attr->recv_cq; - atomic_inc(&qp_init_attr->recv_cq->usecnt); + if (qp_init_attr->recv_cq) + atomic_inc(&qp_init_attr->recv_cq->usecnt); qp->srq = qp_init_attr->srq; if (qp->srq) atomic_inc(&qp_init_attr->srq->usecnt); @@ -803,7 +811,10 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, qp->xrcd = NULL; atomic_inc(&pd->usecnt); - atomic_inc(&qp_init_attr->send_cq->usecnt); + if (qp_init_attr->send_cq) + atomic_inc(&qp_init_attr->send_cq->usecnt); + if (qp_init_attr->rwq_ind_tbl) + atomic_inc(&qp->rwq_ind_tbl->usecnt); if (qp_init_attr->cap.max_rdma_ctxs) { ret = rdma_rw_init_mrs(qp, qp_init_attr); @@ -814,6 +825,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, } } + /* + * Note: all hw drivers guarantee that max_send_sge is lower than + * the device RDMA WRITE SGE limit but not all hw drivers ensure that + * max_send_sge <= max_sge_rd. 
+ */ + qp->max_write_sge = qp_init_attr->cap.max_send_sge; + qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, + device->attrs.max_sge_rd); + return qp; } EXPORT_SYMBOL(ib_create_qp); @@ -1283,6 +1303,7 @@ int ib_destroy_qp(struct ib_qp *qp) struct ib_pd *pd; struct ib_cq *scq, *rcq; struct ib_srq *srq; + struct ib_rwq_ind_table *ind_tbl; int ret; WARN_ON_ONCE(qp->mrs_used > 0); @@ -1297,6 +1318,7 @@ int ib_destroy_qp(struct ib_qp *qp) scq = qp->send_cq; rcq = qp->recv_cq; srq = qp->srq; + ind_tbl = qp->rwq_ind_tbl; if (!qp->uobject) rdma_rw_cleanup_mrs(qp); @@ -1311,6 +1333,8 @@ int ib_destroy_qp(struct ib_qp *qp) atomic_dec(&rcq->usecnt); if (srq) atomic_dec(&srq->usecnt); + if (ind_tbl) + atomic_dec(&ind_tbl->usecnt); } return ret; @@ -1558,6 +1582,150 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd) } EXPORT_SYMBOL(ib_dealloc_xrcd); +/** + * ib_create_wq - Creates a WQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the WQ. + * @wq_init_attr: A list of initial attributes required to create the + * WQ. If WQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created WQ. + * + * wq_init_attr->max_wr and wq_init_attr->max_sge determine + * the requested size of the WQ, and set to the actual values allocated + * on return. + * If ib_create_wq() succeeds, then max_wr and max_sge will always be + * at least as large as the requested values. 
+ */ +struct ib_wq *ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *wq_attr) +{ + struct ib_wq *wq; + + if (!pd->device->create_wq) + return ERR_PTR(-ENOSYS); + + wq = pd->device->create_wq(pd, wq_attr, NULL); + if (!IS_ERR(wq)) { + wq->event_handler = wq_attr->event_handler; + wq->wq_context = wq_attr->wq_context; + wq->wq_type = wq_attr->wq_type; + wq->cq = wq_attr->cq; + wq->device = pd->device; + wq->pd = pd; + wq->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_inc(&wq_attr->cq->usecnt); + atomic_set(&wq->usecnt, 0); + } + return wq; +} +EXPORT_SYMBOL(ib_create_wq); + +/** + * ib_destroy_wq - Destroys the specified WQ. + * @wq: The WQ to destroy. + */ +int ib_destroy_wq(struct ib_wq *wq) +{ + int err; + struct ib_cq *cq = wq->cq; + struct ib_pd *pd = wq->pd; + + if (atomic_read(&wq->usecnt)) + return -EBUSY; + + err = wq->device->destroy_wq(wq); + if (!err) { + atomic_dec(&pd->usecnt); + atomic_dec(&cq->usecnt); + } + return err; +} +EXPORT_SYMBOL(ib_destroy_wq); + +/** + * ib_modify_wq - Modifies the specified WQ. + * @wq: The WQ to modify. + * @wq_attr: On input, specifies the WQ attributes to modify. + * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ + * are being modified. + * On output, the current values of selected WQ attributes are returned. + */ +int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask) +{ + int err; + + if (!wq->device->modify_wq) + return -ENOSYS; + + err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL); + return err; +} +EXPORT_SYMBOL(ib_modify_wq); + +/* + * ib_create_rwq_ind_table - Creates a RQ Indirection Table. + * @device: The device on which to create the rwq indirection table. + * @ib_rwq_ind_table_init_attr: A list of initial attributes required to + * create the Indirection Table. 
+ * + * Note: The life time of ib_rwq_ind_table_init_attr->ind_tbl is not less + * than the created ib_rwq_ind_table object and the caller is responsible + * for its memory allocation/free. + */ +struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr) +{ + struct ib_rwq_ind_table *rwq_ind_table; + int i; + u32 table_size; + + if (!device->create_rwq_ind_table) + return ERR_PTR(-ENOSYS); + + table_size = (1 << init_attr->log_ind_tbl_size); + rwq_ind_table = device->create_rwq_ind_table(device, + init_attr, NULL); + if (IS_ERR(rwq_ind_table)) + return rwq_ind_table; + + rwq_ind_table->ind_tbl = init_attr->ind_tbl; + rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size; + rwq_ind_table->device = device; + rwq_ind_table->uobject = NULL; + atomic_set(&rwq_ind_table->usecnt, 0); + + for (i = 0; i < table_size; i++) + atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt); + + return rwq_ind_table; +} +EXPORT_SYMBOL(ib_create_rwq_ind_table); + +/* + * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table. + * @wq_ind_table: The Indirection Table to destroy. 
+*/ +int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table) +{ + int err, i; + u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size); + struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl; + + if (atomic_read(&rwq_ind_table->usecnt)) + return -EBUSY; + + err = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table); + if (!err) { + for (i = 0; i < table_size; i++) + atomic_dec(&ind_tbl[i]->usecnt); + } + + return err; +} +EXPORT_SYMBOL(ib_destroy_rwq_ind_table); + struct ib_flow *ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 3e8431b5cad7..04bbf172abde 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -1396,10 +1396,10 @@ static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) state_set(&child_ep->com, CONNECTING); child_ep->com.tdev = tdev; child_ep->com.cm_id = NULL; - child_ep->com.local_addr.sin_family = PF_INET; + child_ep->com.local_addr.sin_family = AF_INET; child_ep->com.local_addr.sin_port = req->local_port; child_ep->com.local_addr.sin_addr.s_addr = req->local_ip; - child_ep->com.remote_addr.sin_family = PF_INET; + child_ep->com.remote_addr.sin_family = AF_INET; child_ep->com.remote_addr.sin_port = req->peer_port; child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip; get_ep(&parent_ep->com); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index bb1a839d4d6d..3edb80644b53 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1183,18 +1183,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type); } -static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct iwch_dev *iwch_dev = container_of(dev, struct 
iwch_dev, - ibdev.dev); - struct ethtool_drvinfo info; - struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; - - PDBG("%s dev 0x%p\n", __func__, dev); - lldev->ethtool_ops->get_drvinfo(lldev, &info); - return sprintf(buf, "%s\n", info.fw_version); -} - static ssize_t show_hca(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1334,13 +1322,11 @@ static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats, } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static struct device_attribute *iwch_class_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id, }; @@ -1362,6 +1348,18 @@ static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str, + size_t str_len) +{ + struct iwch_dev *iwch_dev = to_iwch_dev(ibdev); + struct ethtool_drvinfo info; + struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; + + PDBG("%s dev 0x%p\n", __func__, iwch_dev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + snprintf(str, str_len, "%s", info.fw_version); +} + int iwch_register_device(struct iwch_dev *dev) { int ret; @@ -1437,6 +1435,7 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.get_hw_stats = iwch_get_mib; dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; dev->ibdev.get_port_immutable = iwch_port_immutable; + dev->ibdev.get_dev_fw_str = get_dev_fw_ver_str; dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); if (!dev->ibdev.iwcm) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index a3a67216bce6..3aca7f6171b4 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -294,6 +294,25 @@ static void state_set(struct c4iw_ep_common *epc, enum 
c4iw_ep_state new) return; } +static int alloc_ep_skb_list(struct sk_buff_head *ep_skb_list, int size) +{ + struct sk_buff *skb; + unsigned int i; + size_t len; + + len = roundup(sizeof(union cpl_wr_size), 16); + for (i = 0; i < size; i++) { + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) + goto fail; + skb_queue_tail(ep_skb_list, skb); + } + return 0; +fail: + skb_queue_purge(ep_skb_list); + return -ENOMEM; +} + static void *alloc_ep(int size, gfp_t gfp) { struct c4iw_ep_common *epc; @@ -384,6 +403,8 @@ void _c4iw_free_ep(struct kref *kref) if (ep->mpa_skb) kfree_skb(ep->mpa_skb); } + if (!skb_queue_empty(&ep->com.ep_skb_list)) + skb_queue_purge(&ep->com.ep_skb_list); kfree(ep); } @@ -620,25 +641,27 @@ static void abort_arp_failure(void *handle, struct sk_buff *skb) } } -static int send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) +static int send_flowc(struct c4iw_ep *ep) { - unsigned int flowclen = 80; struct fw_flowc_wr *flowc; + struct sk_buff *skb = skb_dequeue(&ep->com.ep_skb_list); int i; u16 vlan = ep->l2t->vlan; int nparams; + if (WARN_ON(!skb)) + return -ENOMEM; + if (vlan == CPL_L2T_VLAN_NONE) nparams = 8; else nparams = 9; - skb = get_skb(skb, flowclen, GFP_KERNEL); - flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen); + flowc = (struct fw_flowc_wr *)__skb_put(skb, FLOWC_LEN); flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) | FW_FLOWC_WR_NPARAMS_V(nparams)); - flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(flowclen, + flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(FLOWC_LEN, 16)) | FW_WR_FLOWID_V(ep->hwtid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; @@ -679,18 +702,16 @@ static int send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) return c4iw_ofld_send(&ep->com.dev->rdev, skb); } -static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp) +static int send_halfclose(struct c4iw_ep *ep) { struct cpl_close_con_req *req; - struct sk_buff *skb; + struct sk_buff *skb = skb_dequeue(&ep->com.ep_skb_list); 
int wrlen = roundup(sizeof *req, 16); PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - skb = get_skb(NULL, wrlen, gfp); - if (!skb) { - printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__); + if (WARN_ON(!skb)) return -ENOMEM; - } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); t4_set_arp_err_handler(skb, NULL, arp_failure_discard); req = (struct cpl_close_con_req *) skb_put(skb, wrlen); @@ -701,26 +722,24 @@ static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp) return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } -static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) +static int send_abort(struct c4iw_ep *ep) { struct cpl_abort_req *req; int wrlen = roundup(sizeof *req, 16); + struct sk_buff *req_skb = skb_dequeue(&ep->com.ep_skb_list); PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - skb = get_skb(skb, wrlen, gfp); - if (!skb) { - printk(KERN_ERR MOD "%s - failed to alloc skb.\n", - __func__); + if (WARN_ON(!req_skb)) return -ENOMEM; - } - set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); - t4_set_arp_err_handler(skb, ep, abort_arp_failure); - req = (struct cpl_abort_req *) skb_put(skb, wrlen); + + set_wr_txq(req_skb, CPL_PRIORITY_DATA, ep->txq_idx); + t4_set_arp_err_handler(req_skb, ep, abort_arp_failure); + req = (struct cpl_abort_req *)skb_put(req_skb, wrlen); memset(req, 0, wrlen); INIT_TP_WR(req, ep->hwtid); OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); req->cmd = CPL_ABORT_SEND_RST; - return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + return c4iw_l2t_send(&ep->com.dev->rdev, req_skb, ep->l2t); } static void best_mtu(const unsigned short *mtus, unsigned short mtu, @@ -992,9 +1011,19 @@ static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, mpa = (struct mpa_message *)(req + 1); memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); - mpa->flags = (crc_enabled ? MPA_CRC : 0) | - (markers_enabled ? MPA_MARKERS : 0) | - (mpa_rev_to_use == 2 ? 
MPA_ENHANCED_RDMA_CONN : 0); + + mpa->flags = 0; + if (crc_enabled) + mpa->flags |= MPA_CRC; + if (markers_enabled) { + mpa->flags |= MPA_MARKERS; + ep->mpa_attr.recv_marker_enabled = 1; + } else { + ep->mpa_attr.recv_marker_enabled = 0; + } + if (mpa_rev_to_use == 2) + mpa->flags |= MPA_ENHANCED_RDMA_CONN; + mpa->private_data_size = htons(ep->plen); mpa->revision = mpa_rev_to_use; if (mpa_rev_to_use == 1) { @@ -1169,8 +1198,11 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) mpa = (struct mpa_message *)(req + 1); memset(mpa, 0, sizeof(*mpa)); memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); - mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | - (markers_enabled ? MPA_MARKERS : 0); + mpa->flags = 0; + if (ep->mpa_attr.crc_enabled) + mpa->flags |= MPA_CRC; + if (ep->mpa_attr.recv_marker_enabled) + mpa->flags |= MPA_MARKERS; mpa->revision = ep->mpa_attr.version; mpa->private_data_size = htons(plen); @@ -1248,7 +1280,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) set_bit(ACT_ESTAB, &ep->com.history); /* start MPA negotiation */ - ret = send_flowc(ep, NULL); + ret = send_flowc(ep); if (ret) goto err; if (ep->retry_with_mpa_v1) @@ -1555,7 +1587,6 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) */ __state_set(&ep->com, FPDU_MODE); ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; - ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; ep->mpa_attr.version = mpa->revision; ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; @@ -2004,12 +2035,17 @@ static int send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) } /* - * Return whether a failed active open has allocated a TID + * Some of the error codes above implicitly indicate that there is no TID + * allocated with the result of an ACT_OPEN. We use this predicate to make + * that explicit. 
*/ static inline int act_open_has_tid(int status) { - return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && - status != CPL_ERR_ARP_MISS; + return (status != CPL_ERR_TCAM_PARITY && + status != CPL_ERR_TCAM_MISS && + status != CPL_ERR_TCAM_FULL && + status != CPL_ERR_CONN_EXIST_SYNRECV && + status != CPL_ERR_CONN_EXIST); } /* Returns whether a CPL status conveys negative advice. @@ -2130,6 +2166,7 @@ out: static int c4iw_reconnect(struct c4iw_ep *ep) { int err = 0; + int size = 0; struct sockaddr_in *laddr = (struct sockaddr_in *) &ep->com.cm_id->m_local_addr; struct sockaddr_in *raddr = (struct sockaddr_in *) @@ -2145,6 +2182,21 @@ static int c4iw_reconnect(struct c4iw_ep *ep) init_timer(&ep->timer); c4iw_init_wr_wait(&ep->com.wr_wait); + /* When MPA revision is different on nodes, the node with MPA_rev=2 + * tries to reconnect with MPA_rev 1 for the same EP through + * c4iw_reconnect(), where the same EP is assigned with new tid for + * further connection establishment. As we are using the same EP pointer + * for reconnect, few skbs are used during the previous c4iw_connect(), + * which leaves the EP with inadequate skbs for further + * c4iw_reconnect(), Further causing an assert BUG_ON() due to empty + * skb_list() during peer_abort(). Allocate skbs which is already used. + */ + size = (CN_MAX_CON_BUF - skb_queue_len(&ep->com.ep_skb_list)); + if (alloc_ep_skb_list(&ep->com.ep_skb_list, size)) { + err = -ENOMEM; + goto fail1; + } + /* * Allocate an active TID to initiate a TCP connection. */ @@ -2210,6 +2262,7 @@ fail2: * response of 1st connect request. 
*/ connect_reply_upcall(ep, -ECONNRESET); +fail1: c4iw_put_ep(&ep->com); out: return err; @@ -2576,6 +2629,10 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) if (peer_mss && child_ep->mtu > (peer_mss + hdrs)) child_ep->mtu = peer_mss + hdrs; + skb_queue_head_init(&child_ep->com.ep_skb_list); + if (alloc_ep_skb_list(&child_ep->com.ep_skb_list, CN_MAX_CON_BUF)) + goto fail; + state_set(&child_ep->com, CONNECTING); child_ep->com.dev = dev; child_ep->com.cm_id = NULL; @@ -2640,6 +2697,8 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) (const u32 *)&sin6->sin6_addr.s6_addr, 1); } goto out; +fail: + c4iw_put_ep(&child_ep->com); reject: reject_cr(dev, hwtid, skb); if (parent_ep) @@ -2670,7 +2729,7 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) ep->com.state = MPA_REQ_WAIT; start_ep_timer(ep); set_bit(PASS_ESTAB, &ep->com.history); - ret = send_flowc(ep, skb); + ret = send_flowc(ep); mutex_unlock(&ep->com.mutex); if (ret) c4iw_ep_disconnect(ep, 1, GFP_KERNEL); @@ -2871,10 +2930,8 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) } mutex_unlock(&ep->com.mutex); - rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL); - if (!rpl_skb) { - printk(KERN_ERR MOD "%s - cannot allocate skb!\n", - __func__); + rpl_skb = skb_dequeue(&ep->com.ep_skb_list); + if (WARN_ON(!rpl_skb)) { release = 1; goto out; } @@ -3011,9 +3068,9 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb) PDBG("%s last streaming msg ack ep %p tid %u state %u " "initiator %u freeing skb\n", __func__, ep, ep->hwtid, state_read(&ep->com), ep->mpa_attr.initiator ? 
1 : 0); + mutex_lock(&ep->com.mutex); kfree_skb(ep->mpa_skb); ep->mpa_skb = NULL; - mutex_lock(&ep->com.mutex); if (test_bit(STOP_MPA_TIMER, &ep->com.flags)) stop_ep_timer(ep); mutex_unlock(&ep->com.mutex); @@ -3025,9 +3082,9 @@ out: int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) { - int err = 0; - int disconnect = 0; + int abort; struct c4iw_ep *ep = to_ep(cm_id); + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); mutex_lock(&ep->com.mutex); @@ -3038,16 +3095,13 @@ int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) } set_bit(ULP_REJECT, &ep->com.history); if (mpa_rev == 0) - disconnect = 2; - else { - err = send_mpa_reject(ep, pdata, pdata_len); - disconnect = 1; - } + abort = 1; + else + abort = send_mpa_reject(ep, pdata, pdata_len); mutex_unlock(&ep->com.mutex); - if (disconnect) { - stop_ep_timer(ep); - err = c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL); - } + + stop_ep_timer(ep); + c4iw_ep_disconnect(ep, abort != 0, GFP_KERNEL); c4iw_put_ep(&ep->com); return 0; } @@ -3248,6 +3302,13 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) err = -ENOMEM; goto out; } + + skb_queue_head_init(&ep->com.ep_skb_list); + if (alloc_ep_skb_list(&ep->com.ep_skb_list, CN_MAX_CON_BUF)) { + err = -ENOMEM; + goto fail1; + } + init_timer(&ep->timer); ep->plen = conn_param->private_data_len; if (ep->plen) @@ -3266,7 +3327,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (!ep->com.qp) { PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn); err = -EINVAL; - goto fail1; + goto fail2; } ref_qp(ep); PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn, @@ -3279,7 +3340,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (ep->atid == -1) { printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); err = -ENOMEM; - goto fail1; + goto fail2; } insert_handle(dev, &dev->atid_idr, ep, ep->atid); @@ -3303,7 +3364,7 
@@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (raddr->sin_addr.s_addr == htonl(INADDR_ANY)) { err = pick_local_ipaddrs(dev, cm_id); if (err) - goto fail1; + goto fail2; } /* find a route */ @@ -3323,7 +3384,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) { err = pick_local_ip6addrs(dev, cm_id); if (err) - goto fail1; + goto fail2; } /* find a route */ @@ -3339,14 +3400,14 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (!ep->dst) { printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); err = -EHOSTUNREACH; - goto fail2; + goto fail3; } err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true, ep->com.dev->rdev.lldi.adapter_type, cm_id->tos); if (err) { printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); - goto fail3; + goto fail4; } PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", @@ -3362,13 +3423,15 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) goto out; cxgb4_l2t_release(ep->l2t); -fail3: +fail4: dst_release(ep->dst); -fail2: +fail3: remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); -fail1: +fail2: + skb_queue_purge(&ep->com.ep_skb_list); deref_cm_id(&ep->com); +fail1: c4iw_put_ep(&ep->com); out: return err; @@ -3461,6 +3524,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) err = -ENOMEM; goto fail1; } + skb_queue_head_init(&ep->com.ep_skb_list); PDBG("%s ep %p\n", __func__, ep); ep->com.cm_id = cm_id; ref_cm_id(&ep->com); @@ -3577,11 +3641,22 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) case MPA_REQ_RCVD: case MPA_REP_SENT: case FPDU_MODE: + case CONNECTING: close = 1; if (abrupt) ep->com.state = ABORTING; else { ep->com.state = CLOSING; + + /* + * if we close before we see the fw4_ack() then we fix + * up the timer state 
since we're reusing it. + */ + if (ep->mpa_skb && + test_bit(STOP_MPA_TIMER, &ep->com.flags)) { + clear_bit(STOP_MPA_TIMER, &ep->com.flags); + stop_ep_timer(ep); + } start_ep_timer(ep); } set_bit(CLOSE_SENT, &ep->com.flags); @@ -3611,10 +3686,10 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) if (abrupt) { set_bit(EP_DISC_ABORT, &ep->com.history); close_complete_upcall(ep, -ECONNRESET); - ret = send_abort(ep, NULL, gfp); + ret = send_abort(ep); } else { set_bit(EP_DISC_CLOSE, &ep->com.history); - ret = send_halfclose(ep, gfp); + ret = send_halfclose(ep); } if (ret) { set_bit(EP_DISC_FAIL, &ep->com.history); diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index b0b955724458..812ab7278b8e 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -33,19 +33,15 @@ #include "iw_cxgb4.h" static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, - struct c4iw_dev_ucontext *uctx) + struct c4iw_dev_ucontext *uctx, struct sk_buff *skb) { struct fw_ri_res_wr *res_wr; struct fw_ri_res *res; int wr_len; struct c4iw_wr_wait wr_wait; - struct sk_buff *skb; int ret; wr_len = sizeof *res_wr + sizeof *res; - skb = alloc_skb(wr_len, GFP_KERNEL); - if (!skb) - return -ENOMEM; set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); @@ -863,7 +859,9 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq) ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context) : NULL; destroy_cq(&chp->rhp->rdev, &chp->cq, - ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx); + ucontext ? 
&ucontext->uctx : &chp->cq.rdev->uctx, + chp->destroy_skb); + chp->destroy_skb = NULL; kfree(chp); return 0; } @@ -879,7 +877,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, struct c4iw_cq *chp; struct c4iw_create_cq_resp uresp; struct c4iw_ucontext *ucontext = NULL; - int ret; + int ret, wr_len; size_t memsize, hwentries; struct c4iw_mm_entry *mm, *mm2; @@ -896,6 +894,13 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, if (!chp) return ERR_PTR(-ENOMEM); + wr_len = sizeof(struct fw_ri_res_wr) + sizeof(struct fw_ri_res); + chp->destroy_skb = alloc_skb(wr_len, GFP_KERNEL); + if (!chp->destroy_skb) { + ret = -ENOMEM; + goto err1; + } + if (ib_context) ucontext = to_c4iw_ucontext(ib_context); @@ -936,7 +941,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, ret = create_cq(&rhp->rdev, &chp->cq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx); if (ret) - goto err1; + goto err2; chp->rhp = rhp; chp->cq.size--; /* status page */ @@ -947,15 +952,15 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, init_waitqueue_head(&chp->wait); ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); if (ret) - goto err2; + goto err3; if (ucontext) { mm = kmalloc(sizeof *mm, GFP_KERNEL); if (!mm) - goto err3; + goto err4; mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); if (!mm2) - goto err4; + goto err5; uresp.qid_mask = rhp->rdev.cqmask; uresp.cqid = chp->cq.cqid; @@ -970,7 +975,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp) - sizeof(uresp.reserved)); if (ret) - goto err5; + goto err6; mm->key = uresp.key; mm->addr = virt_to_phys(chp->cq.queue); @@ -986,15 +991,18 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, __func__, chp->cq.cqid, chp, chp->cq.size, chp->cq.memsize, (unsigned long long) chp->cq.dma_addr); return &chp->ibcq; -err5: +err6: kfree(mm2); -err4: +err5: kfree(mm); -err3: +err4: remove_handle(rhp, &rhp->cqidr, chp->cq.cqid); -err2: +err3: destroy_cq(&chp->rhp->rdev, &chp->cq, - 
ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + ucontext ? &ucontext->uctx : &rhp->rdev.uctx, + chp->destroy_skb); +err2: + kfree_skb(chp->destroy_skb); err1: kfree(chp); return ERR_PTR(ret); diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index ae2e8b23d2dd..071d7332ec06 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -317,7 +317,7 @@ static int qp_open(struct inode *inode, struct file *file) idr_for_each(&qpd->devp->qpidr, count_idrs, &count); spin_unlock_irq(&qpd->devp->lock); - qpd->bufsize = count * 128; + qpd->bufsize = count * 180; qpd->buf = vmalloc(qpd->bufsize); if (!qpd->buf) { kfree(qpd); diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index f6f34a75af27..aa47e0ae80bc 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -384,6 +384,7 @@ struct c4iw_mr { struct ib_mr ibmr; struct ib_umem *umem; struct c4iw_dev *rhp; + struct sk_buff *dereg_skb; u64 kva; struct tpt_attributes attr; u64 *mpl; @@ -400,6 +401,7 @@ static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr) struct c4iw_mw { struct ib_mw ibmw; struct c4iw_dev *rhp; + struct sk_buff *dereg_skb; u64 kva; struct tpt_attributes attr; }; @@ -412,6 +414,7 @@ static inline struct c4iw_mw *to_c4iw_mw(struct ib_mw *ibmw) struct c4iw_cq { struct ib_cq ibcq; struct c4iw_dev *rhp; + struct sk_buff *destroy_skb; struct t4_cq cq; spinlock_t lock; spinlock_t comp_handler_lock; @@ -472,7 +475,7 @@ struct c4iw_qp { struct t4_wq wq; spinlock_t lock; struct mutex mutex; - atomic_t refcnt; + struct kref kref; wait_queue_head_t wait; struct timer_list timer; int sq_sig_all; @@ -789,10 +792,29 @@ enum c4iw_ep_history { CM_ID_DEREFED = 28, }; +enum conn_pre_alloc_buffers { + CN_ABORT_REQ_BUF, + CN_ABORT_RPL_BUF, + CN_CLOSE_CON_REQ_BUF, + CN_DESTROY_BUF, + CN_FLOWC_BUF, + CN_MAX_CON_BUF +}; + +#define FLOWC_LEN 80 +union cpl_wr_size { + 
struct cpl_abort_req abrt_req; + struct cpl_abort_rpl abrt_rpl; + struct fw_ri_wr ri_req; + struct cpl_close_con_req close_req; + char flowc_buf[FLOWC_LEN]; +}; + struct c4iw_ep_common { struct iw_cm_id *cm_id; struct c4iw_qp *qp; struct c4iw_dev *dev; + struct sk_buff_head ep_skb_list; enum c4iw_ep_state state; struct kref kref; struct mutex mutex; diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 55d0651ee4de..0b91b0f4df71 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -59,9 +59,9 @@ static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length) } static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, - u32 len, dma_addr_t data, int wait) + u32 len, dma_addr_t data, + int wait, struct sk_buff *skb) { - struct sk_buff *skb; struct ulp_mem_io *req; struct ulptx_sgl *sgl; u8 wr_len; @@ -74,9 +74,11 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, c4iw_init_wr_wait(&wr_wait); wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16); - skb = alloc_skb(wr_len, GFP_KERNEL); - if (!skb) - return -ENOMEM; + if (!skb) { + skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL); + if (!skb) + return -ENOMEM; + } set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); req = (struct ulp_mem_io *)__skb_put(skb, wr_len); @@ -108,9 +110,8 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, } static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len, - void *data) + void *data, struct sk_buff *skb) { - struct sk_buff *skb; struct ulp_mem_io *req; struct ulptx_idata *sc; u8 wr_len, *to_dp, *from_dp; @@ -134,9 +135,11 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len, wr_len = roundup(sizeof *req + sizeof *sc + roundup(copy_len, T4_ULPTX_MIN_IO), 16); - skb = alloc_skb(wr_len, GFP_KERNEL); - if (!skb) - return -ENOMEM; + if (!skb) { + skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL); + if (!skb) + return 
-ENOMEM; + } set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); req = (struct ulp_mem_io *)__skb_put(skb, wr_len); @@ -173,6 +176,7 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len, memset(to_dp + copy_len, 0, T4_ULPTX_MIN_IO - (copy_len % T4_ULPTX_MIN_IO)); ret = c4iw_ofld_send(rdev, skb); + skb = NULL; if (ret) return ret; len -= C4IW_MAX_INLINE_SIZE; @@ -182,7 +186,8 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len, return ret; } -static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data) +static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, + void *data, struct sk_buff *skb) { u32 remain = len; u32 dmalen; @@ -205,7 +210,7 @@ static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void * dmalen = T4_ULPTX_MAX_DMA; remain -= dmalen; ret = _c4iw_write_mem_dma_aligned(rdev, addr, dmalen, daddr, - !remain); + !remain, skb); if (ret) goto out; addr += dmalen >> 5; @@ -213,7 +218,7 @@ static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void * daddr += dmalen; } if (remain) - ret = _c4iw_write_mem_inline(rdev, addr, remain, data); + ret = _c4iw_write_mem_inline(rdev, addr, remain, data, skb); out: dma_unmap_single(&rdev->lldi.pdev->dev, save, len, DMA_TO_DEVICE); return ret; @@ -224,23 +229,25 @@ out: * If data is NULL, clear len byte of memory to zero. 
*/ static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, - void *data) + void *data, struct sk_buff *skb) { if (is_t5(rdev->lldi.adapter_type) && use_dsgl) { if (len > inline_threshold) { - if (_c4iw_write_mem_dma(rdev, addr, len, data)) { + if (_c4iw_write_mem_dma(rdev, addr, len, data, skb)) { printk_ratelimited(KERN_WARNING "%s: dma map" " failure (non fatal)\n", pci_name(rdev->lldi.pdev)); return _c4iw_write_mem_inline(rdev, addr, len, - data); - } else + data, skb); + } else { return 0; + } } else - return _c4iw_write_mem_inline(rdev, addr, len, data); + return _c4iw_write_mem_inline(rdev, addr, + len, data, skb); } else - return _c4iw_write_mem_inline(rdev, addr, len, data); + return _c4iw_write_mem_inline(rdev, addr, len, data, skb); } /* @@ -253,7 +260,8 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry, u32 *stag, u8 stag_state, u32 pdid, enum fw_ri_stag_type type, enum fw_ri_mem_perms perm, int bind_enabled, u32 zbva, u64 to, - u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr) + u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr, + struct sk_buff *skb) { int err; struct fw_ri_tpte tpt; @@ -307,7 +315,7 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry, } err = write_adapter_mem(rdev, stag_idx + (rdev->lldi.vr->stag.start >> 5), - sizeof(tpt), &tpt); + sizeof(tpt), &tpt, skb); if (reset_tpt_entry) { c4iw_put_resource(&rdev->resource.tpt_table, stag_idx); @@ -327,28 +335,29 @@ static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl, __func__, pbl_addr, rdev->lldi.vr->pbl.start, pbl_size); - err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl); + err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl, NULL); return err; } static int dereg_mem(struct c4iw_rdev *rdev, u32 stag, u32 pbl_size, - u32 pbl_addr) + u32 pbl_addr, struct sk_buff *skb) { return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, - pbl_size, pbl_addr); + pbl_size, pbl_addr, skb); } static 
int allocate_window(struct c4iw_rdev *rdev, u32 * stag, u32 pdid) { *stag = T4_STAG_UNSET; return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_MW, 0, 0, 0, - 0UL, 0, 0, 0, 0); + 0UL, 0, 0, 0, 0, NULL); } -static int deallocate_window(struct c4iw_rdev *rdev, u32 stag) +static int deallocate_window(struct c4iw_rdev *rdev, u32 stag, + struct sk_buff *skb) { return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, 0, - 0); + 0, skb); } static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid, @@ -356,7 +365,7 @@ static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid, { *stag = T4_STAG_UNSET; return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_NSMR, 0, 0, 0, - 0UL, 0, 0, pbl_size, pbl_addr); + 0UL, 0, 0, pbl_size, pbl_addr, NULL); } static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag) @@ -383,14 +392,16 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, mhp->attr.mw_bind_enable, mhp->attr.zbva, mhp->attr.va_fbo, mhp->attr.len ? 
mhp->attr.len : -1, shift - 12, - mhp->attr.pbl_size, mhp->attr.pbl_addr); + mhp->attr.pbl_size, mhp->attr.pbl_addr, NULL); if (ret) return ret; ret = finish_mem_reg(mhp, stag); - if (ret) + if (ret) { dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, - mhp->attr.pbl_addr); + mhp->attr.pbl_addr, mhp->dereg_skb); + mhp->dereg_skb = NULL; + } return ret; } @@ -423,6 +434,12 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc) if (!mhp) return ERR_PTR(-ENOMEM); + mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL); + if (!mhp->dereg_skb) { + ret = -ENOMEM; + goto err0; + } + mhp->rhp = rhp; mhp->attr.pdid = php->pdid; mhp->attr.perms = c4iw_ib_to_tpt_access(acc); @@ -435,7 +452,8 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc) ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, php->pdid, FW_RI_STAG_NSMR, mhp->attr.perms, - mhp->attr.mw_bind_enable, 0, 0, ~0ULL, 0, 0, 0); + mhp->attr.mw_bind_enable, 0, 0, ~0ULL, 0, 0, 0, + NULL); if (ret) goto err1; @@ -445,8 +463,10 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc) return &mhp->ibmr; err2: dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, - mhp->attr.pbl_addr); + mhp->attr.pbl_addr, mhp->dereg_skb); err1: + kfree_skb(mhp->dereg_skb); +err0: kfree(mhp); return ERR_PTR(ret); } @@ -481,11 +501,18 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mhp) return ERR_PTR(-ENOMEM); + mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL); + if (!mhp->dereg_skb) { + kfree(mhp); + return ERR_PTR(-ENOMEM); + } + mhp->rhp = rhp; mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); if (IS_ERR(mhp->umem)) { err = PTR_ERR(mhp->umem); + kfree_skb(mhp->dereg_skb); kfree(mhp); return ERR_PTR(err); } @@ -550,6 +577,7 @@ err_pbl: err: ib_umem_release(mhp->umem); + kfree_skb(mhp->dereg_skb); kfree(mhp); return ERR_PTR(err); } @@ -572,11 +600,16 @@ struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, mhp = kzalloc(sizeof(*mhp), 
GFP_KERNEL); if (!mhp) return ERR_PTR(-ENOMEM); - ret = allocate_window(&rhp->rdev, &stag, php->pdid); - if (ret) { - kfree(mhp); - return ERR_PTR(ret); + + mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL); + if (!mhp->dereg_skb) { + ret = -ENOMEM; + goto free_mhp; } + + ret = allocate_window(&rhp->rdev, &stag, php->pdid); + if (ret) + goto free_skb; mhp->rhp = rhp; mhp->attr.pdid = php->pdid; mhp->attr.type = FW_RI_STAG_MW; @@ -584,12 +617,19 @@ struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, mmid = (stag) >> 8; mhp->ibmw.rkey = stag; if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { - deallocate_window(&rhp->rdev, mhp->attr.stag); - kfree(mhp); - return ERR_PTR(-ENOMEM); + ret = -ENOMEM; + goto dealloc_win; } PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); return &(mhp->ibmw); + +dealloc_win: + deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb); +free_skb: + kfree_skb(mhp->dereg_skb); +free_mhp: + kfree(mhp); + return ERR_PTR(ret); } int c4iw_dealloc_mw(struct ib_mw *mw) @@ -602,7 +642,8 @@ int c4iw_dealloc_mw(struct ib_mw *mw) rhp = mhp->rhp; mmid = (mw->rkey) >> 8; remove_handle(rhp, &rhp->mmidr, mmid); - deallocate_window(&rhp->rdev, mhp->attr.stag); + deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb); + kfree_skb(mhp->dereg_skb); kfree(mhp); PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp); return 0; @@ -666,7 +707,7 @@ struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, return &(mhp->ibmr); err3: dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size, - mhp->attr.pbl_addr); + mhp->attr.pbl_addr, mhp->dereg_skb); err2: c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, mhp->attr.pbl_size << 3); @@ -717,7 +758,7 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr) dma_free_coherent(&mhp->rhp->rdev.lldi.pdev->dev, mhp->max_mpl_len, mhp->mpl, mhp->mpl_addr); dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, - mhp->attr.pbl_addr); + mhp->attr.pbl_addr, mhp->dereg_skb); if (mhp->attr.pbl_size) 
c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, mhp->attr.pbl_size << 3); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index dd8a86b726d2..df127ce6b6ec 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -409,20 +409,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr, CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type)); } -static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, - ibdev.dev); - PDBG("%s dev 0x%p\n", __func__, dev); - - return sprintf(buf, "%u.%u.%u.%u\n", - FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers), - FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers), - FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers), - FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers)); -} - static ssize_t show_hca(struct device *dev, struct device_attribute *attr, char *buf) { @@ -502,13 +488,11 @@ static int c4iw_get_mib(struct ib_device *ibdev, } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static struct device_attribute *c4iw_class_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id, }; @@ -530,6 +514,20 @@ static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +static void get_dev_fw_str(struct ib_device *dev, char *str, + size_t str_len) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev); + PDBG("%s dev 0x%p\n", __func__, dev); + + snprintf(str, str_len, "%u.%u.%u.%u", + FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers), + FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers), + FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers), + 
FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers)); +} + int c4iw_register_device(struct c4iw_dev *dev) { int ret; @@ -605,6 +603,7 @@ int c4iw_register_device(struct c4iw_dev *dev) dev->ibdev.get_hw_stats = c4iw_get_mib; dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; dev->ibdev.get_port_immutable = c4iw_port_immutable; + dev->ibdev.get_dev_fw_str = get_dev_fw_str; dev->ibdev.drain_sq = c4iw_drain_sq; dev->ibdev.drain_rq = c4iw_drain_rq; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index e8993e49b8b3..edb1172b6f54 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -683,17 +683,25 @@ static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr, return 0; } +void _free_qp(struct kref *kref) +{ + struct c4iw_qp *qhp; + + qhp = container_of(kref, struct c4iw_qp, kref); + PDBG("%s qhp %p\n", __func__, qhp); + kfree(qhp); +} + void c4iw_qp_add_ref(struct ib_qp *qp) { PDBG("%s ib_qp %p\n", __func__, qp); - atomic_inc(&(to_c4iw_qp(qp)->refcnt)); + kref_get(&to_c4iw_qp(qp)->kref); } void c4iw_qp_rem_ref(struct ib_qp *qp) { PDBG("%s ib_qp %p\n", __func__, qp); - if (atomic_dec_and_test(&(to_c4iw_qp(qp)->refcnt))) - wake_up(&(to_c4iw_qp(qp)->wait)); + kref_put(&to_c4iw_qp(qp)->kref, _free_qp); } static void add_to_fc_list(struct list_head *head, struct list_head *entry) @@ -1081,9 +1089,10 @@ static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid, qhp->ep->hwtid); - skb = alloc_skb(sizeof *wqe, gfp); - if (!skb) + skb = skb_dequeue(&qhp->ep->com.ep_skb_list); + if (WARN_ON(!skb)) return; + set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe)); @@ -1202,9 +1211,10 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid, ep->hwtid); - skb = alloc_skb(sizeof *wqe, GFP_KERNEL); - if 
(!skb) + skb = skb_dequeue(&ep->com.ep_skb_list); + if (WARN_ON(!skb)) return -ENOMEM; + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe)); @@ -1592,8 +1602,6 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp) wait_event(qhp->wait, !qhp->ep); remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); - atomic_dec(&qhp->refcnt); - wait_event(qhp->wait, !atomic_read(&qhp->refcnt)); spin_lock_irq(&rhp->lock); if (!list_empty(&qhp->db_fc_entry)) @@ -1606,8 +1614,9 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp) destroy_qp(&rhp->rdev, &qhp->wq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + c4iw_qp_rem_ref(ib_qp); + PDBG("%s ib_qp %p qpid 0x%0x\n", __func__, ib_qp, qhp->wq.sq.qid); - kfree(qhp); return 0; } @@ -1704,7 +1713,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, init_completion(&qhp->rq_drained); mutex_init(&qhp->mutex); init_waitqueue_head(&qhp->wait); - atomic_set(&qhp->refcnt, 1); + kref_init(&qhp->kref); ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid); if (ret) @@ -1896,12 +1905,20 @@ int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, return 0; } +static void move_qp_to_err(struct c4iw_qp *qp) +{ + struct c4iw_qp_attributes attrs = { .next_state = C4IW_QP_STATE_ERROR }; + + (void)c4iw_modify_qp(qp->rhp, qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); +} + void c4iw_drain_sq(struct ib_qp *ibqp) { struct c4iw_qp *qp = to_c4iw_qp(ibqp); unsigned long flag; bool need_to_wait; + move_qp_to_err(qp); spin_lock_irqsave(&qp->lock, flag); need_to_wait = !t4_sq_empty(&qp->wq); spin_unlock_irqrestore(&qp->lock, flag); @@ -1916,6 +1933,7 @@ void c4iw_drain_rq(struct ib_qp *ibqp) unsigned long flag; bool need_to_wait; + move_qp_to_err(qp); spin_lock_irqsave(&qp->lock, flag); need_to_wait = !t4_rq_empty(&qp->wq); spin_unlock_irqrestore(&qp->lock, flag); diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig index a925fb0db706..f6ea0881765a 100644 --- 
a/drivers/infiniband/hw/hfi1/Kconfig +++ b/drivers/infiniband/hw/hfi1/Kconfig @@ -1,9 +1,9 @@ config INFINIBAND_HFI1 tristate "Intel OPA Gen1 support" - depends on X86_64 && INFINIBAND_RDMAVT + depends on X86_64 && INFINIBAND_RDMAVT && I2C select MMU_NOTIFIER select CRC32 - default m + select I2C_ALGOBIT ---help--- This is a low-level driver for Intel OPA Gen1 adapter. config HFI1_DEBUG_SDMA_ORDER diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 9b5382c94b0c..0cf97a09b64b 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ eprom.o file_ops.o firmware.o \ init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \ - qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \ + qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \ uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ verbs_txreq.o hfi1-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 14d7eeb09be6..79575ee873f2 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -47,12 +47,18 @@ #include <linux/topology.h> #include <linux/cpumask.h> #include <linux/module.h> +#include <linux/cpumask.h> #include "hfi.h" #include "affinity.h" #include "sdma.h" #include "trace.h" +struct hfi1_affinity_node_list node_affinity = { + .list = LIST_HEAD_INIT(node_affinity.list), + .lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock), +}; + /* Name of IRQ types, indexed by enum irq_type */ static const char * const irq_type_names[] = { "SDMA", @@ -61,6 +67,9 @@ static const char * const irq_type_names[] = { "OTHER", }; +/* Per NUMA node count of HFI devices */ +static unsigned int *hfi1_per_node_cntr; + static inline void init_cpu_mask_set(struct cpu_mask_set *set) { cpumask_clear(&set->mask); @@ -69,47 +78,136 
@@ static inline void init_cpu_mask_set(struct cpu_mask_set *set) } /* Initialize non-HT cpu cores mask */ -int init_real_cpu_mask(struct hfi1_devdata *dd) +void init_real_cpu_mask(void) { - struct hfi1_affinity *info; int possible, curr_cpu, i, ht; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - - cpumask_clear(&info->real_cpu_mask); + cpumask_clear(&node_affinity.real_cpu_mask); /* Start with cpu online mask as the real cpu mask */ - cpumask_copy(&info->real_cpu_mask, cpu_online_mask); + cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask); /* * Remove HT cores from the real cpu mask. Do this in two steps below. */ - possible = cpumask_weight(&info->real_cpu_mask); + possible = cpumask_weight(&node_affinity.real_cpu_mask); ht = cpumask_weight(topology_sibling_cpumask( - cpumask_first(&info->real_cpu_mask))); + cpumask_first(&node_affinity.real_cpu_mask))); /* * Step 1. Skip over the first N HT siblings and use them as the * "real" cores. Assumes that HT cores are not enumerated in * succession (except in the single core case). */ - curr_cpu = cpumask_first(&info->real_cpu_mask); + curr_cpu = cpumask_first(&node_affinity.real_cpu_mask); for (i = 0; i < possible / ht; i++) - curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask); /* * Step 2. Remove the remaining HT siblings. Use cpumask_next() to * skip any gaps. 
*/ for (; i < possible; i++) { - cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask); - curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask); + } +} + +int node_affinity_init(void) +{ + int node; + struct pci_dev *dev = NULL; + const struct pci_device_id *ids = hfi1_pci_tbl; + + cpumask_clear(&node_affinity.proc.used); + cpumask_copy(&node_affinity.proc.mask, cpu_online_mask); + + node_affinity.proc.gen = 0; + node_affinity.num_core_siblings = + cpumask_weight(topology_sibling_cpumask( + cpumask_first(&node_affinity.proc.mask) + )); + node_affinity.num_online_nodes = num_online_nodes(); + node_affinity.num_online_cpus = num_online_cpus(); + + /* + * The real cpu mask is part of the affinity struct but it has to be + * initialized early. It is needed to calculate the number of user + * contexts in set_up_context_variables(). + */ + init_real_cpu_mask(); + + hfi1_per_node_cntr = kcalloc(num_possible_nodes(), + sizeof(*hfi1_per_node_cntr), GFP_KERNEL); + if (!hfi1_per_node_cntr) + return -ENOMEM; + + while (ids->vendor) { + dev = NULL; + while ((dev = pci_get_device(ids->vendor, ids->device, dev))) { + node = pcibus_to_node(dev->bus); + if (node < 0) + node = numa_node_id(); + + hfi1_per_node_cntr[node]++; + } + ids++; } - dd->affinity = info; return 0; } +void node_affinity_destroy(void) +{ + struct list_head *pos, *q; + struct hfi1_affinity_node *entry; + + spin_lock(&node_affinity.lock); + list_for_each_safe(pos, q, &node_affinity.list) { + entry = list_entry(pos, struct hfi1_affinity_node, + list); + list_del(pos); + kfree(entry); + } + spin_unlock(&node_affinity.lock); + kfree(hfi1_per_node_cntr); +} + +static struct hfi1_affinity_node *node_affinity_allocate(int node) +{ + struct hfi1_affinity_node *entry; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + entry->node = node; + INIT_LIST_HEAD(&entry->list); 
+ + return entry; +} + +/* + * It appends an entry to the list. + * It *must* be called with node_affinity.lock held. + */ +static void node_affinity_add_tail(struct hfi1_affinity_node *entry) +{ + list_add_tail(&entry->list, &node_affinity.list); +} + +/* It must be called with node_affinity.lock held */ +static struct hfi1_affinity_node *node_affinity_lookup(int node) +{ + struct list_head *pos; + struct hfi1_affinity_node *entry; + + list_for_each(pos, &node_affinity.list) { + entry = list_entry(pos, struct hfi1_affinity_node, list); + if (entry->node == node) + return entry; + } + + return NULL; +} + /* * Interrupt affinity. * @@ -121,10 +219,10 @@ int init_real_cpu_mask(struct hfi1_devdata *dd) * to the node relative 1 as necessary. * */ -void hfi1_dev_affinity_init(struct hfi1_devdata *dd) +int hfi1_dev_affinity_init(struct hfi1_devdata *dd) { int node = pcibus_to_node(dd->pcidev->bus); - struct hfi1_affinity *info = dd->affinity; + struct hfi1_affinity_node *entry; const struct cpumask *local_mask; int curr_cpu, possible, i; @@ -132,56 +230,93 @@ void hfi1_dev_affinity_init(struct hfi1_devdata *dd) node = numa_node_id(); dd->node = node; - spin_lock_init(&info->lock); - - init_cpu_mask_set(&info->def_intr); - init_cpu_mask_set(&info->rcv_intr); - init_cpu_mask_set(&info->proc); - local_mask = cpumask_of_node(dd->node); if (cpumask_first(local_mask) >= nr_cpu_ids) local_mask = topology_core_cpumask(0); - /* Use the "real" cpu mask of this node as the default */ - cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask); - - /* fill in the receive list */ - possible = cpumask_weight(&info->def_intr.mask); - curr_cpu = cpumask_first(&info->def_intr.mask); - if (possible == 1) { - /* only one CPU, everyone will use it */ - cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask); - } else { - /* - * Retain the first CPU in the default list for the control - * context. 
- */ - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - /* - * Remove the remaining kernel receive queues from - * the default list and add them to the receive list. - */ - for (i = 0; i < dd->n_krcv_queues - 1; i++) { - cpumask_clear_cpu(curr_cpu, &info->def_intr.mask); - cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask); - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - if (curr_cpu >= nr_cpu_ids) - break; + + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); + + /* + * If this is the first time this NUMA node's affinity is used, + * create an entry in the global affinity structure and initialize it. + */ + if (!entry) { + entry = node_affinity_allocate(node); + if (!entry) { + dd_dev_err(dd, + "Unable to allocate global affinity node\n"); + return -ENOMEM; } - } + init_cpu_mask_set(&entry->def_intr); + init_cpu_mask_set(&entry->rcv_intr); + cpumask_clear(&entry->general_intr_mask); + /* Use the "real" cpu mask of this node as the default */ + cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, + local_mask); + + /* fill in the receive list */ + possible = cpumask_weight(&entry->def_intr.mask); + curr_cpu = cpumask_first(&entry->def_intr.mask); + + if (possible == 1) { + /* only one CPU, everyone will use it */ + cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask); + cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); + } else { + /* + * The general/control context will be the first CPU in + * the default list, so it is removed from the default + * list and added to the general interrupt list. + */ + cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask); + cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); + curr_cpu = cpumask_next(curr_cpu, + &entry->def_intr.mask); - cpumask_copy(&info->proc.mask, cpu_online_mask); -} + /* + * Remove the remaining kernel receive queues from + * the default list and add them to the receive list. 
+ */ + for (i = 0; + i < (dd->n_krcv_queues - 1) * + hfi1_per_node_cntr[dd->node]; + i++) { + cpumask_clear_cpu(curr_cpu, + &entry->def_intr.mask); + cpumask_set_cpu(curr_cpu, + &entry->rcv_intr.mask); + curr_cpu = cpumask_next(curr_cpu, + &entry->def_intr.mask); + if (curr_cpu >= nr_cpu_ids) + break; + } -void hfi1_dev_affinity_free(struct hfi1_devdata *dd) -{ - kfree(dd->affinity); + /* + * If there ends up being 0 CPU cores leftover for SDMA + * engines, use the same CPU cores as general/control + * context. + */ + if (cpumask_weight(&entry->def_intr.mask) == 0) + cpumask_copy(&entry->def_intr.mask, + &entry->general_intr_mask); + } + + spin_lock(&node_affinity.lock); + node_affinity_add_tail(entry); + spin_unlock(&node_affinity.lock); + } + + return 0; } int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) { int ret; cpumask_var_t diff; - struct cpu_mask_set *set; + struct hfi1_affinity_node *entry; + struct cpu_mask_set *set = NULL; struct sdma_engine *sde = NULL; struct hfi1_ctxtdata *rcd = NULL; char extra[64]; @@ -194,22 +329,25 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) if (!ret) return -ENOMEM; + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); + switch (msix->type) { case IRQ_SDMA: sde = (struct sdma_engine *)msix->arg; scnprintf(extra, 64, "engine %u", sde->this_idx); - /* fall through */ + set = &entry->def_intr; + break; case IRQ_GENERAL: - set = &dd->affinity->def_intr; + cpu = cpumask_first(&entry->general_intr_mask); break; case IRQ_RCVCTXT: rcd = (struct hfi1_ctxtdata *)msix->arg; - if (rcd->ctxt == HFI1_CTRL_CTXT) { - set = &dd->affinity->def_intr; - cpu = cpumask_first(&set->mask); - } else { - set = &dd->affinity->rcv_intr; - } + if (rcd->ctxt == HFI1_CTRL_CTXT) + cpu = cpumask_first(&entry->general_intr_mask); + else + set = &entry->rcv_intr; scnprintf(extra, 64, "ctxt %u", rcd->ctxt); break; default: @@ -218,12 
+356,12 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) } /* - * The control receive context is placed on a particular CPU, which - * is set above. Skip accounting for it. Everything else finds its - * CPU here. + * The general and control contexts are placed on a particular + * CPU, which is set above. Skip accounting for it. Everything else + * finds its CPU here. */ - if (cpu == -1) { - spin_lock(&dd->affinity->lock); + if (cpu == -1 && set) { + spin_lock(&node_affinity.lock); if (cpumask_equal(&set->mask, &set->used)) { /* * We've used up all the CPUs, bump up the generation @@ -235,7 +373,7 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) cpumask_andnot(diff, &set->mask, &set->used); cpu = cpumask_first(diff); cpumask_set_cpu(cpu, &set->used); - spin_unlock(&dd->affinity->lock); + spin_unlock(&node_affinity.lock); } switch (msix->type) { @@ -263,43 +401,84 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd, { struct cpu_mask_set *set = NULL; struct hfi1_ctxtdata *rcd; + struct hfi1_affinity_node *entry; + + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); switch (msix->type) { case IRQ_SDMA: + set = &entry->def_intr; + break; case IRQ_GENERAL: - set = &dd->affinity->def_intr; + /* Don't do accounting for general contexts */ break; case IRQ_RCVCTXT: rcd = (struct hfi1_ctxtdata *)msix->arg; - /* only do accounting for non control contexts */ + /* Don't do accounting for control contexts */ if (rcd->ctxt != HFI1_CTRL_CTXT) - set = &dd->affinity->rcv_intr; + set = &entry->rcv_intr; break; default: return; } if (set) { - spin_lock(&dd->affinity->lock); + spin_lock(&node_affinity.lock); cpumask_andnot(&set->used, &set->used, &msix->mask); if (cpumask_empty(&set->used) && set->gen) { set->gen--; cpumask_copy(&set->used, &set->mask); } - spin_unlock(&dd->affinity->lock); + spin_unlock(&node_affinity.lock); } 
irq_set_affinity_hint(msix->msix.vector, NULL); cpumask_clear(&msix->mask); } -int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) +/* This should be called with node_affinity.lock held */ +static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask, + struct hfi1_affinity_node_list *affinity) { - int cpu = -1, ret; - cpumask_var_t diff, mask, intrs; + int possible, curr_cpu, i; + uint num_cores_per_socket = node_affinity.num_online_cpus / + affinity->num_core_siblings / + node_affinity.num_online_nodes; + + cpumask_copy(hw_thread_mask, &affinity->proc.mask); + if (affinity->num_core_siblings > 0) { + /* Removing other siblings not needed for now */ + possible = cpumask_weight(hw_thread_mask); + curr_cpu = cpumask_first(hw_thread_mask); + for (i = 0; + i < num_cores_per_socket * node_affinity.num_online_nodes; + i++) + curr_cpu = cpumask_next(curr_cpu, hw_thread_mask); + + for (; i < possible; i++) { + cpumask_clear_cpu(curr_cpu, hw_thread_mask); + curr_cpu = cpumask_next(curr_cpu, hw_thread_mask); + } + + /* Identifying correct HW threads within physical cores */ + cpumask_shift_left(hw_thread_mask, hw_thread_mask, + num_cores_per_socket * + node_affinity.num_online_nodes * + hw_thread_no); + } +} + +int hfi1_get_proc_affinity(int node) +{ + int cpu = -1, ret, i; + struct hfi1_affinity_node *entry; + cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; const struct cpumask *node_mask, *proc_mask = tsk_cpus_allowed(current); - struct cpu_mask_set *set = &dd->affinity->proc; + struct hfi1_affinity_node_list *affinity = &node_affinity; + struct cpu_mask_set *set = &affinity->proc; /* * check whether process/context affinity has already @@ -325,22 +504,41 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) /* * The process does not have a preset CPU affinity so find one to - * recommend. We prefer CPUs on the same NUMA as the device. 
+ * recommend using the following algorithm: + * + * For each user process that is opening a context on HFI Y: + * a) If all cores are filled, reinitialize the bitmask + * b) Fill real cores first, then HT cores (First set of HT + * cores on all physical cores, then second set of HT core, + * and, so on) in the following order: + * + * 1. Same NUMA node as HFI Y and not running an IRQ + * handler + * 2. Same NUMA node as HFI Y and running an IRQ handler + * 3. Different NUMA node to HFI Y and not running an IRQ + * handler + * 4. Different NUMA node to HFI Y and running an IRQ + * handler + * c) Mark core as filled in the bitmask. As user processes are + * done, clear cores from the bitmask. */ ret = zalloc_cpumask_var(&diff, GFP_KERNEL); if (!ret) goto done; - ret = zalloc_cpumask_var(&mask, GFP_KERNEL); + ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL); if (!ret) goto free_diff; - ret = zalloc_cpumask_var(&intrs, GFP_KERNEL); + ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL); + if (!ret) + goto free_hw_thread_mask; + ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL); if (!ret) - goto free_mask; + goto free_available_mask; - spin_lock(&dd->affinity->lock); + spin_lock(&affinity->lock); /* - * If we've used all available CPUs, clear the mask and start + * If we've used all available HW threads, clear the mask and start * overloading. */ if (cpumask_equal(&set->mask, &set->used)) { @@ -348,81 +546,198 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) cpumask_clear(&set->used); } - /* CPUs used by interrupt handlers */ - cpumask_copy(intrs, (dd->affinity->def_intr.gen ? - &dd->affinity->def_intr.mask : - &dd->affinity->def_intr.used)); - cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ? - &dd->affinity->rcv_intr.mask : - &dd->affinity->rcv_intr.used)); + /* + * If NUMA node has CPUs used by interrupt handlers, include them in the + * interrupt handler mask. 
+ */ + entry = node_affinity_lookup(node); + if (entry) { + cpumask_copy(intrs_mask, (entry->def_intr.gen ? + &entry->def_intr.mask : + &entry->def_intr.used)); + cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ? + &entry->rcv_intr.mask : + &entry->rcv_intr.used)); + cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask); + } hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl", - cpumask_pr_args(intrs)); + cpumask_pr_args(intrs_mask)); + + cpumask_copy(hw_thread_mask, &set->mask); /* - * If we don't have a NUMA node requested, preference is towards - * device NUMA node + * If HT cores are enabled, identify which HW threads within the + * physical cores should be used. */ - if (node == -1) - node = dd->node; + if (affinity->num_core_siblings > 0) { + for (i = 0; i < affinity->num_core_siblings; i++) { + find_hw_thread_mask(i, hw_thread_mask, affinity); + + /* + * If there's at least one available core for this HW + * thread number, stop looking for a core. + * + * diff will always be not empty at least once in this + * loop as the used mask gets reset when + * (set->mask == set->used) before this loop. 
+ */ + cpumask_andnot(diff, hw_thread_mask, &set->used); + if (!cpumask_empty(diff)) + break; + } + } + hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl", + cpumask_pr_args(hw_thread_mask)); + node_mask = cpumask_of_node(node); - hfi1_cdbg(PROC, "device on NUMA %u, CPUs %*pbl", node, + hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node, cpumask_pr_args(node_mask)); - /* diff will hold all unused cpus */ - cpumask_andnot(diff, &set->mask, &set->used); - hfi1_cdbg(PROC, "unused CPUs (all) %*pbl", cpumask_pr_args(diff)); - - /* get cpumask of available CPUs on preferred NUMA */ - cpumask_and(mask, diff, node_mask); - hfi1_cdbg(PROC, "available cpus on NUMA %*pbl", cpumask_pr_args(mask)); + /* Get cpumask of available CPUs on preferred NUMA */ + cpumask_and(available_mask, hw_thread_mask, node_mask); + cpumask_andnot(available_mask, available_mask, &set->used); + hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node, + cpumask_pr_args(available_mask)); /* * At first, we don't want to place processes on the same - * CPUs as interrupt handlers. + * CPUs as interrupt handlers. Then, CPUs running interrupt + * handlers are used. + * + * 1) If diff is not empty, then there are CPUs not running + * non-interrupt handlers available, so diff gets copied + * over to available_mask. + * 2) If diff is empty, then all CPUs not running interrupt + * handlers are taken, so available_mask contains all + * available CPUs running interrupt handlers. + * 3) If available_mask is empty, then all CPUs on the + * preferred NUMA node are taken, so other NUMA nodes are + * used for process assignments using the same method as + * the preferred NUMA node. 
*/ - cpumask_andnot(diff, mask, intrs); + cpumask_andnot(diff, available_mask, intrs_mask); if (!cpumask_empty(diff)) - cpumask_copy(mask, diff); + cpumask_copy(available_mask, diff); - /* - * if we don't have a cpu on the preferred NUMA, get - * the list of the remaining available CPUs - */ - if (cpumask_empty(mask)) { - cpumask_andnot(diff, &set->mask, &set->used); - cpumask_andnot(mask, diff, node_mask); + /* If we don't have CPUs on the preferred node, use other NUMA nodes */ + if (cpumask_empty(available_mask)) { + cpumask_andnot(available_mask, hw_thread_mask, &set->used); + /* Excluding preferred NUMA cores */ + cpumask_andnot(available_mask, available_mask, node_mask); + hfi1_cdbg(PROC, + "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl", + cpumask_pr_args(available_mask)); + + /* + * At first, we don't want to place processes on the same + * CPUs as interrupt handlers. + */ + cpumask_andnot(diff, available_mask, intrs_mask); + if (!cpumask_empty(diff)) + cpumask_copy(available_mask, diff); } - hfi1_cdbg(PROC, "possible CPUs for process %*pbl", - cpumask_pr_args(mask)); + hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl", + cpumask_pr_args(available_mask)); - cpu = cpumask_first(mask); + cpu = cpumask_first(available_mask); if (cpu >= nr_cpu_ids) /* empty */ cpu = -1; else cpumask_set_cpu(cpu, &set->used); - spin_unlock(&dd->affinity->lock); - - free_cpumask_var(intrs); -free_mask: - free_cpumask_var(mask); + spin_unlock(&affinity->lock); + hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu); + + free_cpumask_var(intrs_mask); +free_available_mask: + free_cpumask_var(available_mask); +free_hw_thread_mask: + free_cpumask_var(hw_thread_mask); free_diff: free_cpumask_var(diff); done: return cpu; } -void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu) +void hfi1_put_proc_affinity(int cpu) { - struct cpu_mask_set *set = &dd->affinity->proc; + struct hfi1_affinity_node_list *affinity = &node_affinity; + struct cpu_mask_set 
*set = &affinity->proc; if (cpu < 0) return; - spin_lock(&dd->affinity->lock); + spin_lock(&affinity->lock); cpumask_clear_cpu(cpu, &set->used); + hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu); if (cpumask_empty(&set->used) && set->gen) { set->gen--; cpumask_copy(&set->used, &set->mask); } - spin_unlock(&dd->affinity->lock); + spin_unlock(&affinity->lock); } +/* Prevents concurrent reads and writes of the sdma_affinity attrib */ +static DEFINE_MUTEX(sdma_affinity_mutex); + +int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf, + size_t count) +{ + struct hfi1_affinity_node *entry; + struct cpumask mask; + int ret, i; + + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); + + if (!entry) + return -EINVAL; + + ret = cpulist_parse(buf, &mask); + if (ret) + return ret; + + if (!cpumask_subset(&mask, cpu_online_mask) || cpumask_empty(&mask)) { + dd_dev_warn(dd, "Invalid CPU mask\n"); + return -EINVAL; + } + + mutex_lock(&sdma_affinity_mutex); + /* reset the SDMA interrupt affinity details */ + init_cpu_mask_set(&entry->def_intr); + cpumask_copy(&entry->def_intr.mask, &mask); + /* + * Reassign the affinity for each SDMA interrupt. + */ + for (i = 0; i < dd->num_msix_entries; i++) { + struct hfi1_msix_entry *msix; + + msix = &dd->msix_entries[i]; + if (msix->type != IRQ_SDMA) + continue; + + ret = hfi1_get_irq_affinity(dd, msix); + + if (ret) + break; + } + + mutex_unlock(&sdma_affinity_mutex); + return ret ? 
ret : strnlen(buf, PAGE_SIZE); +} + +int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf) +{ + struct hfi1_affinity_node *entry; + + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); + + if (!entry) + return -EINVAL; + + mutex_lock(&sdma_affinity_mutex); + cpumap_print_to_pagebuf(true, buf, &entry->def_intr.mask); + mutex_unlock(&sdma_affinity_mutex); + return strnlen(buf, PAGE_SIZE); +} diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h index 20f52fe74091..8879cf7a8cac 100644 --- a/drivers/infiniband/hw/hfi1/affinity.h +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -73,7 +73,6 @@ struct cpu_mask_set { struct hfi1_affinity { struct cpu_mask_set def_intr; struct cpu_mask_set rcv_intr; - struct cpu_mask_set proc; struct cpumask real_cpu_mask; /* spin lock to protect affinity struct */ spinlock_t lock; @@ -82,11 +81,9 @@ struct hfi1_affinity { struct hfi1_msix_entry; /* Initialize non-HT cpu cores mask */ -int init_real_cpu_mask(struct hfi1_devdata *); +void init_real_cpu_mask(void); /* Initialize driver affinity data */ -void hfi1_dev_affinity_init(struct hfi1_devdata *); -/* Free driver affinity data */ -void hfi1_dev_affinity_free(struct hfi1_devdata *); +int hfi1_dev_affinity_init(struct hfi1_devdata *); /* * Set IRQ affinity to a CPU. The function will determine the * CPU and set the affinity to it. @@ -101,8 +98,35 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *); * Determine a CPU affinity for a user process, if the process does not * have an affinity set yet. */ -int hfi1_get_proc_affinity(struct hfi1_devdata *, int); +int hfi1_get_proc_affinity(int); /* Release a CPU used by a user process. 
*/ -void hfi1_put_proc_affinity(struct hfi1_devdata *, int); +void hfi1_put_proc_affinity(int); + +int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf); +int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf, + size_t count); + +struct hfi1_affinity_node { + int node; + struct cpu_mask_set def_intr; + struct cpu_mask_set rcv_intr; + struct cpumask general_intr_mask; + struct list_head list; +}; + +struct hfi1_affinity_node_list { + struct list_head list; + struct cpumask real_cpu_mask; + struct cpu_mask_set proc; + int num_core_siblings; + int num_online_nodes; + int num_online_cpus; + /* protect affinity node list */ + spinlock_t lock; +}; + +int node_affinity_init(void); +void node_affinity_destroy(void); +extern struct hfi1_affinity_node_list node_affinity; #endif /* _HFI1_AFFINITY_H */ diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index dad4d0ebbdff..b32638d58ae8 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -63,6 +63,7 @@ #include "efivar.h" #include "platform.h" #include "aspm.h" +#include "affinity.h" #define NUM_IB_PORTS 1 @@ -121,6 +122,7 @@ struct flag_table { #define SEC_SC_HALTED 0x4 /* per-context only */ #define SEC_SPC_FREEZE 0x8 /* per-HFI only */ +#define DEFAULT_KRCVQS 2 #define MIN_KERNEL_KCTXTS 2 #define FIRST_KERNEL_KCTXT 1 /* sizes for both the QP and RSM map tables */ @@ -238,6 +240,9 @@ struct flag_table { /* all CceStatus sub-block RXE pause bits */ #define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK +#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL +#define CNTR_32BIT_MAX 0x00000000FFFFFFFF + /* * CCE Error flags. 
*/ @@ -3947,6 +3952,28 @@ static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry, return dd->sw_send_dma_eng_err_status_cnt[0]; } +static u64 access_dc_rcv_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + u64 val = 0; + u64 csr = entry->csr; + + val = read_write_csr(dd, csr, mode, data); + if (mode == CNTR_MODE_R) { + val = val > CNTR_MAX - dd->sw_rcv_bypass_packet_errors ? + CNTR_MAX : val + dd->sw_rcv_bypass_packet_errors; + } else if (mode == CNTR_MODE_W) { + dd->sw_rcv_bypass_packet_errors = 0; + } else { + dd_dev_err(dd, "Invalid cntr register access mode"); + return 0; + } + return val; +} + #define def_access_sw_cpu(cntr) \ static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry, \ void *context, int vl, int mode, u64 data) \ @@ -4020,7 +4047,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = { CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL), [C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT, CNTR_SYNTH), -[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH), +[C_DC_RCV_ERR] = CNTR_ELEM("DcRecvErr", DCC_ERR_PORTRCV_ERR_CNT, 0, CNTR_SYNTH, + access_dc_rcv_err_cnt), [C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT, CNTR_SYNTH), [C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT, @@ -8798,30 +8826,6 @@ static int write_tx_settings(struct hfi1_devdata *dd, return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame); } -static void check_fabric_firmware_versions(struct hfi1_devdata *dd) -{ - u32 frame, version, prod_id; - int ret, lane; - - /* 4 lanes */ - for (lane = 0; lane < 4; lane++) { - ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame); - if (ret) { - dd_dev_err(dd, - "Unable to read lane %d firmware details\n", - lane); - continue; - } - version = (frame >> SPICO_ROM_VERSION_SHIFT) - & SPICO_ROM_VERSION_MASK; - prod_id = (frame >> 
SPICO_ROM_PROD_ID_SHIFT) - & SPICO_ROM_PROD_ID_MASK; - dd_dev_info(dd, - "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n", - lane, version, prod_id); - } -} - /* * Read an idle LCB message. * @@ -9187,17 +9191,24 @@ static void wait_for_qsfp_init(struct hfi1_pportdata *ppd) unsigned long timeout; /* - * Check for QSFP interrupt for t_init (SFF 8679) + * Some QSFP cables have a quirk that asserts the IntN line as a side + * effect of power up on plug-in. We ignore this false positive + * interrupt until the module has finished powering up by waiting for + * a minimum timeout of the module inrush initialization time of + * 500 ms (SFF 8679 Table 5-6) to ensure the voltage rails in the + * module have stabilized. + */ + msleep(500); + + /* + * Check for QSFP interrupt for t_init (SFF 8679 Table 8-1) */ timeout = jiffies + msecs_to_jiffies(2000); while (1) { mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN); - if (!(mask & QSFP_HFI0_INT_N)) { - write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : - ASIC_QSFP1_CLEAR, QSFP_HFI0_INT_N); + if (!(mask & QSFP_HFI0_INT_N)) break; - } if (time_after(jiffies, timeout)) { dd_dev_info(dd, "%s: No IntN detected, reset complete\n", __func__); @@ -9213,10 +9224,17 @@ static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable) u64 mask; mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK); - if (enable) + if (enable) { + /* + * Clear the status register to avoid an immediate interrupt + * when we re-enable the IntN pin + */ + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR, + QSFP_HFI0_INT_N); mask |= (u64)QSFP_HFI0_INT_N; - else + } else { mask &= ~(u64)QSFP_HFI0_INT_N; + } write_csr(dd, dd->hfi1_id ? 
ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask); } @@ -9630,14 +9648,6 @@ void hfi1_clear_tids(struct hfi1_ctxtdata *rcd) hfi1_put_tid(dd, i, PT_INVALID, 0, 0); } -int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd, - struct hfi1_ctxt_info *kinfo) -{ - kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) | - HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U); - return 0; -} - struct hfi1_message_header *hfi1_get_msgheader( struct hfi1_devdata *dd, __le32 *rhf_addr) { @@ -9890,6 +9900,131 @@ static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs) return 0; } +static const char *state_completed_string(u32 completed) +{ + static const char * const state_completed[] = { + "EstablishComm", + "OptimizeEQ", + "VerifyCap" + }; + + if (completed < ARRAY_SIZE(state_completed)) + return state_completed[completed]; + + return "unknown"; +} + +static const char all_lanes_dead_timeout_expired[] = + "All lanes were inactive – was the interconnect media removed?"; +static const char tx_out_of_policy[] = + "Passing lanes on local port do not meet the local link width policy"; +static const char no_state_complete[] = + "State timeout occurred before link partner completed the state"; +static const char * const state_complete_reasons[] = { + [0x00] = "Reason unknown", + [0x01] = "Link was halted by driver, refer to LinkDownReason", + [0x02] = "Link partner reported failure", + [0x10] = "Unable to achieve frame sync on any lane", + [0x11] = + "Unable to find a common bit rate with the link partner", + [0x12] = + "Unable to achieve frame sync on sufficient lanes to meet the local link width policy", + [0x13] = + "Unable to identify preset equalization on sufficient lanes to meet the local link width policy", + [0x14] = no_state_complete, + [0x15] = + "State timeout occurred before link partner identified equalization presets", + [0x16] = + "Link partner completed the EstablishComm state, but the passing lanes do not meet the local link width policy", + [0x17] = 
tx_out_of_policy, + [0x20] = all_lanes_dead_timeout_expired, + [0x21] = + "Unable to achieve acceptable BER on sufficient lanes to meet the local link width policy", + [0x22] = no_state_complete, + [0x23] = + "Link partner completed the OptimizeEq state, but the passing lanes do not meet the local link width policy", + [0x24] = tx_out_of_policy, + [0x30] = all_lanes_dead_timeout_expired, + [0x31] = + "State timeout occurred waiting for host to process received frames", + [0x32] = no_state_complete, + [0x33] = + "Link partner completed the VerifyCap state, but the passing lanes do not meet the local link width policy", + [0x34] = tx_out_of_policy, +}; + +static const char *state_complete_reason_code_string(struct hfi1_pportdata *ppd, + u32 code) +{ + const char *str = NULL; + + if (code < ARRAY_SIZE(state_complete_reasons)) + str = state_complete_reasons[code]; + + if (str) + return str; + return "Reserved"; +} + +/* describe the given last state complete frame */ +static void decode_state_complete(struct hfi1_pportdata *ppd, u32 frame, + const char *prefix) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 success; + u32 state; + u32 reason; + u32 lanes; + + /* + * Decode frame: + * [ 0: 0] - success + * [ 3: 1] - state + * [ 7: 4] - next state timeout + * [15: 8] - reason code + * [31:16] - lanes + */ + success = frame & 0x1; + state = (frame >> 1) & 0x7; + reason = (frame >> 8) & 0xff; + lanes = (frame >> 16) & 0xffff; + + dd_dev_err(dd, "Last %s LNI state complete frame 0x%08x:\n", + prefix, frame); + dd_dev_err(dd, " last reported state state: %s (0x%x)\n", + state_completed_string(state), state); + dd_dev_err(dd, " state successfully completed: %s\n", + success ? "yes" : "no"); + dd_dev_err(dd, " fail reason 0x%x: %s\n", + reason, state_complete_reason_code_string(ppd, reason)); + dd_dev_err(dd, " passing lane mask: 0x%x", lanes); +} + +/* + * Read the last state complete frames and explain them. 
This routine + * expects to be called if the link went down during link negotiation + * and initialization (LNI). That is, anywhere between polling and link up. + */ +static void check_lni_states(struct hfi1_pportdata *ppd) +{ + u32 last_local_state; + u32 last_remote_state; + + read_last_local_state(ppd->dd, &last_local_state); + read_last_remote_state(ppd->dd, &last_remote_state); + + /* + * Don't report anything if there is nothing to report. A value of + * 0 means the link was taken down while polling and there was no + * training in-process. + */ + if (last_local_state == 0 && last_remote_state == 0) + return; + + decode_state_complete(ppd, last_local_state, "transmitted"); + decode_state_complete(ppd, last_remote_state, "received"); +} + /* * Helper for set_link_state(). Do not call except from that routine. * Expects ppd->hls_mutex to be held. @@ -9902,8 +10037,6 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) { struct hfi1_devdata *dd = ppd->dd; u32 pstate, previous_state; - u32 last_local_state; - u32 last_remote_state; int ret; int do_transition; int do_wait; @@ -10003,12 +10136,7 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) } else if (previous_state & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { /* went down while attempting link up */ - /* byte 1 of last_*_state is the failure reason */ - read_last_local_state(dd, &last_local_state); - read_last_remote_state(dd, &last_remote_state); - dd_dev_err(dd, - "LNI failure last states: local 0x%08x, remote 0x%08x\n", - last_local_state, last_remote_state); + check_lni_states(ppd); } /* the active link width (downgrade) is 0 on link down */ @@ -11668,9 +11796,6 @@ static void free_cntrs(struct hfi1_devdata *dd) dd->cntrnames = NULL; } -#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL -#define CNTR_32BIT_MAX 0x00000000FFFFFFFF - static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry, u64 *psval, void *context, int vl) { @@ -12325,37 +12450,6 @@ u8 
hfi1_ibphys_portstate(struct hfi1_pportdata *ppd) return ib_pstate; } -/* - * Read/modify/write ASIC_QSFP register bits as selected by mask - * data: 0 or 1 in the positions depending on what needs to be written - * dir: 0 for read, 1 for write - * mask: select by setting - * I2CCLK (bit 0) - * I2CDATA (bit 1) - */ -u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir, - u32 mask) -{ - u64 qsfp_oe, target_oe; - - target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE; - if (mask) { - /* We are writing register bits, so lock access */ - dir &= mask; - data &= mask; - - qsfp_oe = read_csr(dd, target_oe); - qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir; - write_csr(dd, target_oe, qsfp_oe); - } - /* We are exclusively reading bits here, but it is unlikely - * we'll get valid data when we set the direction of the pin - * in the same call, so read should call this function again - * to get valid data - */ - return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN); -} - #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) @@ -12780,7 +12874,6 @@ static int set_up_context_variables(struct hfi1_devdata *dd) /* * Kernel receive contexts: - * - min of 2 or 1 context/numa (excluding control context) * - Context 0 - control context (VL15/multicast/error) * - Context 1 - first kernel context * - Context 2 - second kernel context @@ -12794,9 +12887,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd) */ num_kernel_contexts = n_krcvqs + 1; else - num_kernel_contexts = num_online_nodes() + 1; - num_kernel_contexts = - max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts); + num_kernel_contexts = DEFAULT_KRCVQS + 1; /* * Every kernel receive context needs an ACK send context. 
* one send context is allocated for each VL{0-7} and VL15 @@ -12815,7 +12906,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd) */ if (num_user_contexts < 0) num_user_contexts = - cpumask_weight(&dd->affinity->real_cpu_mask); + cpumask_weight(&node_affinity.real_cpu_mask); total_contexts = num_kernel_contexts + num_user_contexts; @@ -14141,6 +14232,11 @@ static int init_asic_data(struct hfi1_devdata *dd) } dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */ spin_unlock_irqrestore(&hfi1_devs_lock, flags); + + /* first one through - set up i2c devices */ + if (!peer) + ret = set_up_i2c(dd, dd->asic_data); + return ret; } @@ -14445,19 +14541,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, (dd->revision >> CCE_REVISION_SW_SHIFT) & CCE_REVISION_SW_MASK); - /* - * The real cpu mask is part of the affinity struct but has to be - * initialized earlier than the rest of the affinity struct because it - * is needed to calculate the number of user contexts in - * set_up_context_variables(). However, hfi1_dev_affinity_init(), - * which initializes the rest of the affinity struct members, - * depends on set_up_context_variables() for the number of kernel - * contexts, so it cannot be called before set_up_context_variables(). 
- */ - ret = init_real_cpu_mask(dd); - if (ret) - goto bail_cleanup; - ret = set_up_context_variables(dd); if (ret) goto bail_cleanup; @@ -14471,7 +14554,9 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* set up KDETH QP prefix in both RX and TX CSRs */ init_kdeth_qp(dd); - hfi1_dev_affinity_init(dd); + ret = hfi1_dev_affinity_init(dd); + if (ret) + goto bail_cleanup; /* send contexts must be set up before receive contexts */ ret = init_send_contexts(dd); @@ -14508,8 +14593,14 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* set up LCB access - must be after set_up_interrupts() */ init_lcb_access(dd); + /* + * Serial number is created from the base guid: + * [27:24] = base guid [38:35] + * [23: 0] = base guid [23: 0] + */ snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n", - dd->base_guid & 0xFFFFFF); + (dd->base_guid & 0xFFFFFF) | + ((dd->base_guid >> 11) & 0xF000000)); dd->oui1 = dd->base_guid >> 56 & 0xFF; dd->oui2 = dd->base_guid >> 48 & 0xFF; @@ -14518,7 +14609,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, ret = load_firmware(dd); /* asymmetric with dispose_firmware() */ if (ret) goto bail_clear_intr; - check_fabric_firmware_versions(dd); thermal_init(dd); diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 66a327978739..ed11107c50fe 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -640,6 +640,7 @@ extern uint platform_config_load; /* SBus commands */ #define RESET_SBUS_RECEIVER 0x20 #define WRITE_SBUS_RECEIVER 0x21 +#define READ_SBUS_RECEIVER 0x22 void sbus_request(struct hfi1_devdata *dd, u8 receiver_addr, u8 data_addr, u8 command, u32 data_in); int sbus_request_slow(struct hfi1_devdata *dd, @@ -1336,10 +1337,6 @@ void hfi1_start_cleanup(struct hfi1_devdata *dd); void hfi1_clear_tids(struct hfi1_ctxtdata *rcd); struct hfi1_message_header *hfi1_get_msgheader( struct hfi1_devdata *dd, __le32 *rhf_addr); -int hfi1_get_base_kinfo(struct hfi1_ctxtdata 
*rcd, - struct hfi1_ctxt_info *kinfo); -u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir, - u32 mask); int hfi1_init_ctxt(struct send_context *sc); void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, u32 type, unsigned long pa, u16 order); diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h index 8744de6667c2..5b9993899789 100644 --- a/drivers/infiniband/hw/hfi1/chip_registers.h +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -471,6 +471,10 @@ #define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010) #define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull #define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull +#define ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT 2 +#define ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK 0x7ull +#define ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT 32 +#define ASIC_STS_SBUS_RESULT_DATA_OUT_MASK 0xFFFFFFFFull #define ASIC_STS_THERM (ASIC + 0x000000000058) #define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull #define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18 diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index c75b0ae688f8..8246dc7d0573 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -392,9 +392,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, u16 rlid; u8 svc_type, sl, sc5; - sc5 = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf; - if (rhf_dc_info(packet->rhf)) - sc5 |= 0x10; + sc5 = hdr2sc(rhdr, packet->rhf); sl = ibp->sc_to_sl[sc5]; lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK; @@ -450,14 +448,20 @@ static inline void init_packet(struct hfi1_ctxtdata *rcd, packet->rcv_flags = 0; } -static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr, - struct hfi1_other_headers *ohdr, - u64 rhf, u32 bth1, struct ib_grh *grh) +void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool do_cnp) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - u32 rqpn = 
0; - u16 rlid; - u8 sc5, svc_type; + struct hfi1_ib_header *hdr = pkt->hdr; + struct hfi1_other_headers *ohdr = pkt->ohdr; + struct ib_grh *grh = NULL; + u32 rqpn = 0, bth1; + u16 rlid, dlid = be16_to_cpu(hdr->lrh[1]); + u8 sc, svc_type; + bool is_mcast = false; + + if (pkt->rcv_flags & HFI1_HAS_GRH) + grh = &hdr->u.l.grh; switch (qp->ibqp.qp_type) { case IB_QPT_SMI: @@ -466,6 +470,8 @@ static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr, rlid = be16_to_cpu(hdr->lrh[3]); rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; svc_type = IB_CC_SVCTYPE_UD; + is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); break; case IB_QPT_UC: rlid = qp->remote_ah_attr.dlid; @@ -481,24 +487,23 @@ static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr, return; } - sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; - if (rhf_dc_info(rhf)) - sc5 |= 0x10; + sc = hdr2sc((struct hfi1_message_header *)hdr, pkt->rhf); - if (bth1 & HFI1_FECN_SMASK) { + bth1 = be32_to_cpu(ohdr->bth[1]); + if (do_cnp && (bth1 & HFI1_FECN_SMASK)) { u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]); - u16 dlid = be16_to_cpu(hdr->lrh[1]); - return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc5, grh); + return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc, grh); } - if (bth1 & HFI1_BECN_SMASK) { + if (!is_mcast && (bth1 & HFI1_BECN_SMASK)) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); u32 lqpn = bth1 & RVT_QPN_MASK; - u8 sl = ibp->sc_to_sl[sc5]; + u8 sl = ibp->sc_to_sl[sc]; process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); } + } struct ps_mdata { @@ -596,7 +601,6 @@ static void __prescan_rxq(struct hfi1_packet *packet) struct rvt_qp *qp; struct hfi1_ib_header *hdr; struct hfi1_other_headers *ohdr; - struct ib_grh *grh = NULL; struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; u64 rhf = rhf_to_cpu(rhf_addr); u32 etype = rhf_rcv_type(rhf), qpn, bth1; @@ -616,14 +620,13 @@ static void __prescan_rxq(struct hfi1_packet *packet) hfi1_get_msgheader(dd, 
rhf_addr); lnh = be16_to_cpu(hdr->lrh[0]) & 3; - if (lnh == HFI1_LRH_BTH) { + if (lnh == HFI1_LRH_BTH) ohdr = &hdr->u.oth; - } else if (lnh == HFI1_LRH_GRH) { + else if (lnh == HFI1_LRH_GRH) ohdr = &hdr->u.l.oth; - grh = &hdr->u.l.grh; - } else { + else goto next; /* just in case */ - } + bth1 = be32_to_cpu(ohdr->bth[1]); is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK)); @@ -639,7 +642,7 @@ static void __prescan_rxq(struct hfi1_packet *packet) goto next; } - process_ecn(qp, hdr, ohdr, rhf, bth1, grh); + process_ecn(qp, packet, true); rcu_read_unlock(); /* turn off BECN, FECN */ @@ -1362,6 +1365,7 @@ int process_receive_bypass(struct hfi1_packet *packet) dd_dev_err(packet->rcd->dd, "Bypass packets are not supported in normal operation. Dropping\n"); + incr_cntr64(&packet->rcd->dd->sw_rcv_bypass_packet_errors); return RHF_RCV_CONTINUE; } diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index c702a009608f..1ecbec192358 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -168,6 +168,7 @@ static inline int is_valid_mmap(u64 token) static int hfi1_file_open(struct inode *inode, struct file *fp) { + struct hfi1_filedata *fd; struct hfi1_devdata *dd = container_of(inode->i_cdev, struct hfi1_devdata, user_cdev); @@ -176,10 +177,17 @@ static int hfi1_file_open(struct inode *inode, struct file *fp) kobject_get(&dd->kobj); /* The real work is performed later in assign_ctxt() */ - fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL); - if (fp->private_data) /* no cpu affinity by default */ - ((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1; - return fp->private_data ? 0 : -ENOMEM; + + fd = kzalloc(sizeof(*fd), GFP_KERNEL); + + if (fd) { + fd->rec_cpu_num = -1; /* no cpu affinity by default */ + fd->mm = current->mm; + } + + fp->private_data = fd; + + return fd ? 
0 : -ENOMEM; } static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, @@ -228,7 +236,7 @@ static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, sizeof(struct hfi1_base_info)); break; case HFI1_IOCTL_CREDIT_UPD: - if (uctxt && uctxt->sc) + if (uctxt) sc_return_credits(uctxt->sc); break; @@ -392,41 +400,38 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) struct hfi1_filedata *fd = kiocb->ki_filp->private_data; struct hfi1_user_sdma_pkt_q *pq = fd->pq; struct hfi1_user_sdma_comp_q *cq = fd->cq; - int ret = 0, done = 0, reqs = 0; + int done = 0, reqs = 0; unsigned long dim = from->nr_segs; - if (!cq || !pq) { - ret = -EIO; - goto done; - } + if (!cq || !pq) + return -EIO; - if (!iter_is_iovec(from) || !dim) { - ret = -EINVAL; - goto done; - } + if (!iter_is_iovec(from) || !dim) + return -EINVAL; hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)", fd->uctxt->ctxt, fd->subctxt, dim); - if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) { - ret = -ENOSPC; - goto done; - } + if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) + return -ENOSPC; while (dim) { + int ret; unsigned long count = 0; ret = hfi1_user_sdma_process_request( kiocb->ki_filp, (struct iovec *)(from->iov + done), dim, &count); - if (ret) - goto done; + if (ret) { + reqs = ret; + break; + } dim -= count; done += count; reqs++; } -done: - return ret ? 
ret : reqs; + + return reqs; } static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) @@ -718,7 +723,7 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) hfi1_user_sdma_free_queues(fdata); /* release the cpu */ - hfi1_put_proc_affinity(dd, fdata->rec_cpu_num); + hfi1_put_proc_affinity(fdata->rec_cpu_num); /* * Clear any left over, unhandled events so the next process that @@ -730,7 +735,6 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) if (--uctxt->cnt) { uctxt->active_slaves &= ~(1 << fdata->subctxt); - uctxt->subpid[fdata->subctxt] = 0; mutex_unlock(&hfi1_mutex); goto done; } @@ -756,7 +760,6 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE, hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type)); sc_disable(uctxt->sc); - uctxt->pid = 0; spin_unlock_irqrestore(&dd->uctxt_lock, flags); dd->rcd[uctxt->ctxt] = NULL; @@ -818,9 +821,10 @@ static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo) ret = find_shared_ctxt(fp, uinfo); if (ret < 0) goto done_unlock; - if (ret) - fd->rec_cpu_num = hfi1_get_proc_affinity( - fd->uctxt->dd, fd->uctxt->numa_id); + if (ret) { + fd->rec_cpu_num = + hfi1_get_proc_affinity(fd->uctxt->numa_id); + } } /* @@ -895,7 +899,6 @@ static int find_shared_ctxt(struct file *fp, } fd->uctxt = uctxt; fd->subctxt = uctxt->cnt++; - uctxt->subpid[fd->subctxt] = current->pid; uctxt->active_slaves |= 1 << fd->subctxt; ret = 1; goto done; @@ -932,7 +935,11 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, if (ctxt == dd->num_rcv_contexts) return -EBUSY; - fd->rec_cpu_num = hfi1_get_proc_affinity(dd, -1); + /* + * If we don't have a NUMA node requested, preference is towards + * device NUMA node. 
+ */ + fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node); if (fd->rec_cpu_num != -1) numa = cpu_to_node(fd->rec_cpu_num); else @@ -976,8 +983,7 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, return ret; } uctxt->userversion = uinfo->userversion; - uctxt->pid = current->pid; - uctxt->flags = HFI1_CAP_UGET(MASK); + uctxt->flags = hfi1_cap_mask; /* save current flag state */ init_waitqueue_head(&uctxt->wait); strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm)); memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)); @@ -1080,18 +1086,18 @@ static int user_init(struct file *fp) hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey); rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; - if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP)) + if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP)) rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB; /* * Ignore the bit in the flags for now until proper * support for multiple packet per rcv array entry is * added. */ - if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR)) + if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR)) rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; - if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL)) + if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL)) rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; - if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) + if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; /* * The RcvCtxtCtrl.TailUpd bit has to be explicitly written. @@ -1099,7 +1105,7 @@ static int user_init(struct file *fp) * uses of the chip or ctxt. Therefore, add the rcvctrl op * for both cases. 
*/ - if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL)) + if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL)) rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; else rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS; @@ -1122,9 +1128,14 @@ static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len) int ret = 0; memset(&cinfo, 0, sizeof(cinfo)); - ret = hfi1_get_base_kinfo(uctxt, &cinfo); - if (ret < 0) - goto done; + cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) & + HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) | + HFI1_CAP_UGET_MASK(uctxt->flags, MASK) | + HFI1_CAP_KGET_MASK(uctxt->flags, K2U); + /* adjust flag if this fd is not able to cache */ + if (!fd->handler) + cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */ + cinfo.num_active = hfi1_count_active_units(); cinfo.unit = uctxt->dd->unit; cinfo.ctxt = uctxt->ctxt; @@ -1146,7 +1157,7 @@ static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len) trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo); if (copy_to_user(ubase, &cinfo, sizeof(cinfo))) ret = -EFAULT; -done: + return ret; } diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c index ed680fda611d..13db8eb4f4ec 100644 --- a/drivers/infiniband/hw/hfi1/firmware.c +++ b/drivers/infiniband/hw/hfi1/firmware.c @@ -206,6 +206,9 @@ static const struct firmware *platform_config; /* the number of fabric SerDes on the SBus */ #define NUM_FABRIC_SERDES 4 +/* ASIC_STS_SBUS_RESULT.RESULT_CODE value */ +#define SBUS_READ_COMPLETE 0x4 + /* SBus fabric SerDes addresses, one set per HFI */ static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = { { 0x01, 0x02, 0x03, 0x04 }, @@ -240,6 +243,7 @@ static const u8 all_pcie_serdes_broadcast = 0xe0; static void dispose_one_firmware(struct firmware_details *fdet); static int load_fabric_serdes_firmware(struct hfi1_devdata *dd, struct firmware_details *fdet); +static void dump_fw_version(struct hfi1_devdata *dd); /* * Read a single 64-bit value from 8051 
data memory. @@ -1079,6 +1083,44 @@ void sbus_request(struct hfi1_devdata *dd, } /* + * Read a value from the SBus. + * + * Requires the caller to be in fast mode + */ +static u32 sbus_read(struct hfi1_devdata *dd, u8 receiver_addr, u8 data_addr, + u32 data_in) +{ + u64 reg; + int retries; + int success = 0; + u32 result = 0; + u32 result_code = 0; + + sbus_request(dd, receiver_addr, data_addr, READ_SBUS_RECEIVER, data_in); + + for (retries = 0; retries < 100; retries++) { + usleep_range(1000, 1200); /* arbitrary */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + result_code = (reg >> ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT) + & ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK; + if (result_code != SBUS_READ_COMPLETE) + continue; + + success = 1; + result = (reg >> ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT) + & ASIC_STS_SBUS_RESULT_DATA_OUT_MASK; + break; + } + + if (!success) { + dd_dev_err(dd, "%s: read failed, result code 0x%x\n", __func__, + result_code); + } + + return result; +} + +/* * Turn off the SBus and fabric serdes spicos. * * + Must be called with Sbus fast mode turned on. 
@@ -1636,6 +1678,7 @@ int load_firmware(struct hfi1_devdata *dd) return ret; } + dump_fw_version(dd); return 0; } @@ -2054,3 +2097,85 @@ void read_guid(struct hfi1_devdata *dd) dd_dev_info(dd, "GUID %llx", (unsigned long long)dd->base_guid); } + +/* read and display firmware version info */ +static void dump_fw_version(struct hfi1_devdata *dd) +{ + u32 pcie_vers[NUM_PCIE_SERDES]; + u32 fabric_vers[NUM_FABRIC_SERDES]; + u32 sbus_vers; + int i; + int all_same; + int ret; + u8 rcv_addr; + + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, "Unable to acquire SBus to read firmware versions\n"); + return; + } + + /* set fast mode */ + set_sbus_fast_mode(dd); + + /* read version for SBus Master */ + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x02, WRITE_SBUS_RECEIVER, 0); + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x07, WRITE_SBUS_RECEIVER, 0x1); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + sbus_vers = sbus_read(dd, SBUS_MASTER_BROADCAST, 0x08, 0x1); + dd_dev_info(dd, "SBus Master firmware version 0x%08x\n", sbus_vers); + + /* read version for PCIe SerDes */ + all_same = 1; + pcie_vers[0] = 0; + for (i = 0; i < NUM_PCIE_SERDES; i++) { + rcv_addr = pcie_serdes_addrs[dd->hfi1_id][i]; + sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + pcie_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0); + if (i > 0 && pcie_vers[0] != pcie_vers[i]) + all_same = 0; + } + + if (all_same) { + dd_dev_info(dd, "PCIe SerDes firmware version 0x%x\n", + pcie_vers[0]); + } else { + dd_dev_warn(dd, "PCIe SerDes do not have the same firmware version\n"); + for (i = 0; i < NUM_PCIE_SERDES; i++) { + dd_dev_info(dd, + "PCIe SerDes lane %d firmware version 0x%x\n", + i, pcie_vers[i]); + } + } + + /* read version for fabric SerDes */ + all_same = 1; + fabric_vers[0] = 0; + for (i = 0; i < NUM_FABRIC_SERDES; i++) { + rcv_addr = fabric_serdes_addrs[dd->hfi1_id][i]; + 
sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + fabric_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0); + if (i > 0 && fabric_vers[0] != fabric_vers[i]) + all_same = 0; + } + + if (all_same) { + dd_dev_info(dd, "Fabric SerDes firmware version 0x%x\n", + fabric_vers[0]); + } else { + dd_dev_warn(dd, "Fabric SerDes do not have the same firmware version\n"); + for (i = 0; i < NUM_FABRIC_SERDES; i++) { + dd_dev_info(dd, + "Fabric SerDes lane %d firmware version 0x%x\n", + i, fabric_vers[i]); + } + } + + clear_sbus_fast_mode(dd); + release_chip_resource(dd, CR_SBUS); +} diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 4417a0fd3ef9..1000e0fd96d9 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -62,6 +62,8 @@ #include <linux/cdev.h> #include <linux/delay.h> #include <linux/kthread.h> +#include <linux/i2c.h> +#include <linux/i2c-algo-bit.h> #include <rdma/rdma_vt.h> #include "chip_registers.h" @@ -253,7 +255,7 @@ struct hfi1_ctxtdata { /* chip offset of PIO buffers for this ctxt */ u32 piobufs; /* per-context configuration flags */ - u32 flags; + unsigned long flags; /* per-context event flags for fileops/intr communication */ unsigned long event_flags; /* WAIT_RCV that timed out, no interrupt */ @@ -268,9 +270,6 @@ struct hfi1_ctxtdata { u32 urgent; /* saved total number of polled urgent packets for poll edge trigger */ u32 urgent_poll; - /* pid of process using this ctxt */ - pid_t pid; - pid_t subpid[HFI1_MAX_SHARED_CTXTS]; /* same size as task_struct .comm[], command that opened context */ char comm[TASK_COMM_LEN]; /* so file ops can get at unit */ @@ -366,11 +365,6 @@ struct hfi1_packet { u8 etype; }; -static inline bool has_sc4_bit(struct hfi1_packet *p) -{ - return !!rhf_dc_info(p->rhf); -} - /* * Private data for snoop/capture support. 
*/ @@ -805,10 +799,19 @@ struct hfi1_temp { u8 triggers; /* temperature triggers */ }; +struct hfi1_i2c_bus { + struct hfi1_devdata *controlling_dd; /* current controlling device */ + struct i2c_adapter adapter; /* bus details */ + struct i2c_algo_bit_data algo; /* bus algorithm details */ + int num; /* bus number, 0 or 1 */ +}; + /* common data between shared ASIC HFIs */ struct hfi1_asic_data { struct hfi1_devdata *dds[2]; /* back pointers */ struct mutex asic_resource_mutex; + struct hfi1_i2c_bus *i2c_bus0; + struct hfi1_i2c_bus *i2c_bus1; }; /* device data struct now contains only "general per-device" info. @@ -1128,7 +1131,8 @@ struct hfi1_devdata { NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS]; /* Software counter that aggregates all cce_err_status errors */ u64 sw_cce_err_status_aggregate; - + /* Software counter that aggregates all bypass packet rcv errors */ + u64 sw_rcv_bypass_packet_errors; /* receive interrupt functions */ rhf_rcv_function_ptr *rhf_rcv_function_map; rhf_rcv_function_ptr normal_rhf_rcv_functions[8]; @@ -1174,6 +1178,8 @@ struct hfi1_devdata { /* 8051 firmware version helper */ #define dc8051_ver(a, b) ((a) << 8 | (b)) +#define dc8051_ver_maj(a) ((a & 0xff00) >> 8) +#define dc8051_ver_min(a) (a & 0x00ff) /* f_put_tid types */ #define PT_EXPECTED 0 @@ -1182,6 +1188,7 @@ struct hfi1_devdata { struct tid_rb_node; struct mmu_rb_node; +struct mmu_rb_handler; /* Private data for file operations */ struct hfi1_filedata { @@ -1192,7 +1199,7 @@ struct hfi1_filedata { /* for cpu affinity; -1 if none */ int rec_cpu_num; u32 tid_n_pinned; - struct rb_root tid_rb_root; + struct mmu_rb_handler *handler; struct tid_rb_node **entry_to_rb; spinlock_t tid_lock; /* protect tid_[limit,used] counters */ u32 tid_limit; @@ -1201,6 +1208,7 @@ struct hfi1_filedata { u32 invalid_tid_idx; /* protect invalid_tids array and invalid_tid_idx */ spinlock_t invalid_lock; + struct mm_struct *mm; }; extern struct list_head hfi1_dev_list; @@ -1234,6 +1242,8 @@ int 
handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int); int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int); void set_all_slowpath(struct hfi1_devdata *dd); +extern const struct pci_device_id hfi1_pci_tbl[]; + /* receive packet handler dispositions */ #define RCV_PKT_OK 0x0 /* keep going */ #define RCV_PKT_LIMIT 0x1 /* stop, hit limit, start thread */ @@ -1259,7 +1269,7 @@ void receive_interrupt_work(struct work_struct *work); static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf) { return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) | - ((!!(rhf & RHF_DC_INFO_SMASK)) << 4); + ((!!(rhf_dc_info(rhf))) << 4); } static inline u16 generate_jkey(kuid_t uid) @@ -1569,6 +1579,22 @@ static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port) return &dd->pport[pidx].ibport_data; } +void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool do_cnp); +static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool do_cnp) +{ + struct hfi1_other_headers *ohdr = pkt->ohdr; + u32 bth1; + + bth1 = be32_to_cpu(ohdr->bth[1]); + if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) { + hfi1_process_ecn_slowpath(qp, pkt, do_cnp); + return bth1 & HFI1_FECN_SMASK; + } + return false; +} + /* * Return the indexed PKEY from the port PKEY table. */ @@ -1586,8 +1612,7 @@ static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index) } /* - * Readers of cc_state must call get_cc_state() under rcu_read_lock(). - * Writers of cc_state must call get_cc_state() under cc_state_lock. + * Called by readers of cc_state only, must call under rcu_read_lock(). */ static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd) { @@ -1595,6 +1620,16 @@ static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd) } /* + * Called by writers of cc_state only, must call under cc_state_lock. 
+ */ +static inline +struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd) +{ + return rcu_dereference_protected(ppd->cc_state, + lockdep_is_held(&ppd->cc_state_lock)); +} + +/* * values for dd->flags (_device_ related flags) */ #define HFI1_INITTED 0x1 /* chip and driver up and initted */ @@ -1669,9 +1704,12 @@ void shutdown_led_override(struct hfi1_pportdata *ppd); */ #define DEFAULT_RCVHDR_ENTSIZE 32 -bool hfi1_can_pin_pages(struct hfi1_devdata *, u32, u32); -int hfi1_acquire_user_pages(unsigned long, size_t, bool, struct page **); -void hfi1_release_user_pages(struct mm_struct *, struct page **, size_t, bool); +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, + u32 nlocked, u32 npages); +int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, + size_t npages, bool writable, struct page **pages); +void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, + size_t npages, bool dirty); static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd) { @@ -1947,4 +1985,55 @@ static inline u32 qsfp_resource(struct hfi1_devdata *dd) int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); +#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) +#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) + +#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype } +#define show_packettype(etype) \ +__print_symbolic(etype, \ + packettype_name(EXPECTED), \ + packettype_name(EAGER), \ + packettype_name(IB), \ + packettype_name(ERROR), \ + packettype_name(BYPASS)) + +#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode } +#define show_ib_opcode(opcode) \ +__print_symbolic(opcode, \ + ib_opcode_name(RC_SEND_FIRST), \ + ib_opcode_name(RC_SEND_MIDDLE), \ + ib_opcode_name(RC_SEND_LAST), \ + ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_SEND_ONLY), \ + ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_FIRST), 
\ + ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(RC_RDMA_WRITE_LAST), \ + ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_READ_REQUEST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \ + ib_opcode_name(RC_ACKNOWLEDGE), \ + ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ + ib_opcode_name(RC_COMPARE_SWAP), \ + ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(UC_SEND_FIRST), \ + ib_opcode_name(UC_SEND_MIDDLE), \ + ib_opcode_name(UC_SEND_LAST), \ + ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_SEND_ONLY), \ + ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_FIRST), \ + ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(UC_RDMA_WRITE_LAST), \ + ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UD_SEND_ONLY), \ + ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(CNP)) #endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index eed971ccd2a1..a358d23ecd54 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -64,6 +64,7 @@ #include "debugfs.h" #include "verbs.h" #include "aspm.h" +#include "affinity.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -474,8 +475,9 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, struct hfi1_devdata *dd, u8 hw_pidx, u8 port) { - int i, size; + int i; uint default_pkey_idx; + struct cc_state *cc_state; ppd->dd = dd; ppd->hw_pidx = hw_pidx; @@ -526,9 +528,9 @@ void hfi1_init_pportdata(struct pci_dev 
*pdev, struct hfi1_pportdata *ppd, spin_lock_init(&ppd->cc_state_lock); spin_lock_init(&ppd->cc_log_lock); - size = sizeof(struct cc_state); - RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL)); - if (!rcu_dereference(ppd->cc_state)) + cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL); + RCU_INIT_POINTER(ppd->cc_state, cc_state); + if (!cc_state) goto bail; return; @@ -972,39 +974,49 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) /* * Release our hold on the shared asic data. If we are the last one, - * free the structure. Must be holding hfi1_devs_lock. + * return the structure to be finalized outside the lock. Must be + * holding hfi1_devs_lock. */ -static void release_asic_data(struct hfi1_devdata *dd) +static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd) { + struct hfi1_asic_data *ad; int other; if (!dd->asic_data) - return; + return NULL; dd->asic_data->dds[dd->hfi1_id] = NULL; other = dd->hfi1_id ? 0 : 1; - if (!dd->asic_data->dds[other]) { - /* we are the last holder, free it */ - kfree(dd->asic_data); - } + ad = dd->asic_data; dd->asic_data = NULL; + /* return NULL if the other dd still has a link */ + return ad->dds[other] ? 
NULL : ad; +} + +static void finalize_asic_data(struct hfi1_devdata *dd, + struct hfi1_asic_data *ad) +{ + clean_up_i2c(dd, ad); + kfree(ad); } static void __hfi1_free_devdata(struct kobject *kobj) { struct hfi1_devdata *dd = container_of(kobj, struct hfi1_devdata, kobj); + struct hfi1_asic_data *ad; unsigned long flags; spin_lock_irqsave(&hfi1_devs_lock, flags); idr_remove(&hfi1_unit_table, dd->unit); list_del(&dd->list); - release_asic_data(dd); + ad = release_asic_data(dd); spin_unlock_irqrestore(&hfi1_devs_lock, flags); + if (ad) + finalize_asic_data(dd, ad); free_platform_config(dd); rcu_barrier(); /* wait for rcu callbacks to complete */ free_percpu(dd->int_counter); free_percpu(dd->rcv_limit); - hfi1_dev_affinity_free(dd); free_percpu(dd->send_schedule); rvt_dealloc_device(&dd->verbs_dev.rdi); } @@ -1162,7 +1174,7 @@ static int init_one(struct pci_dev *, const struct pci_device_id *); #define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: " #define PFX DRIVER_NAME ": " -static const struct pci_device_id hfi1_pci_tbl[] = { +const struct pci_device_id hfi1_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) }, { 0, } @@ -1198,6 +1210,10 @@ static int __init hfi1_mod_init(void) if (ret) goto bail; + ret = node_affinity_init(); + if (ret) + goto bail; + /* validate max MTU before any devices start */ if (!valid_opa_max_mtu(hfi1_max_mtu)) { pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n", @@ -1278,6 +1294,7 @@ module_init(hfi1_mod_init); static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); + node_affinity_destroy(); hfi1_wss_exit(); hfi1_dbg_exit(); hfi1_cpulist_count = 0; @@ -1311,7 +1328,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd) hrtimer_cancel(&ppd->cca_timer[i].hrtimer); spin_lock(&ppd->cc_state_lock); - cc_state = get_cc_state(ppd); + cc_state = get_cc_state_protected(ppd); RCU_INIT_POINTER(ppd->cc_state, NULL); 
spin_unlock(&ppd->cc_state_lock); @@ -1760,8 +1777,8 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) hfi1_cdbg(PROC, "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n", - rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size, - rcd->egrbufs.size); + rcd->ctxt, rcd->egrbufs.alloced, + rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024); /* * Set the contexts rcv array head update threshold to the closest diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index fca07a1d6c28..1263abe01999 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -588,7 +588,6 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, pi->port_phys_conf = (ppd->port_type & 0xf); -#if PI_LED_ENABLE_SUP pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; pi->port_states.ledenable_offlinereason |= ppd->is_sm_config_started << 5; @@ -602,11 +601,6 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6; pi->port_states.ledenable_offlinereason |= ppd->offline_disabled_reason; -#else - pi->port_states.offline_reason = ppd->neighbor_normal << 4; - pi->port_states.offline_reason |= ppd->is_sm_config_started << 5; - pi->port_states.offline_reason |= ppd->offline_disabled_reason; -#endif /* PI_LED_ENABLE_SUP */ pi->port_states.portphysstate_portstate = (hfi1_ibphys_portstate(ppd) << 4) | state; @@ -1752,17 +1746,11 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, if (start_of_sm_config && (lstate == IB_PORT_INIT)) ppd->is_sm_config_started = 1; -#if PI_LED_ENABLE_SUP psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; psi->port_states.ledenable_offlinereason |= ppd->is_sm_config_started << 5; psi->port_states.ledenable_offlinereason |= ppd->offline_disabled_reason; -#else - psi->port_states.offline_reason = ppd->neighbor_normal << 4; - 
psi->port_states.offline_reason |= ppd->is_sm_config_started << 5; - psi->port_states.offline_reason |= ppd->offline_disabled_reason; -#endif /* PI_LED_ENABLE_SUP */ psi->port_states.portphysstate_portstate = (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf); @@ -2430,14 +2418,9 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, rsp->port_rcv_remote_physical_errors = cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL)); - tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); - tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); - if (tmp2 < tmp) { - /* overflow/wrapped */ - rsp->local_link_integrity_errors = cpu_to_be64(~0); - } else { - rsp->local_link_integrity_errors = cpu_to_be64(tmp2); - } + rsp->local_link_integrity_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL)); tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL); @@ -2499,6 +2482,9 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl))); + rsp->vls[vfi].port_vl_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl))); vlinfo++; vfi++; } @@ -2529,9 +2515,8 @@ static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port, error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL); /* local link integrity must be right-shifted by the lli resolution */ - tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); - tmp += read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); - error_counter_summary += (tmp >> res_lli); + error_counter_summary += (read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL) >> res_lli); /* link error recovery must b right-shifted by the ler resolution */ tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL); @@ -2800,14 +2785,9 @@ static 
void pma_get_opa_port_ectrs(struct ib_device *ibdev, rsp->port_rcv_constraint_errors = cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL)); - tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); - tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); - if (tmp2 < tmp) { - /* overflow/wrapped */ - rsp->local_link_integrity_errors = cpu_to_be64(~0); - } else { - rsp->local_link_integrity_errors = cpu_to_be64(tmp2); - } + rsp->local_link_integrity_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL)); rsp->excessive_buffer_overruns = cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); } @@ -2883,14 +2863,17 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; - + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); vlinfo = &rsp->vls[0]; vfi = 0; vl_select_mask = be32_to_cpu(req->vl_select_mask); for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), 8 * sizeof(req->vl_select_mask)) { memset(vlinfo, 0, sizeof(*vlinfo)); - /* vlinfo->vls[vfi].port_vl_xmit_discards ??? 
*/ + rsp->vls[vfi].port_vl_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl))); vlinfo += 1; vfi++; } @@ -3162,10 +3145,8 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS) write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0); - if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) { - write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0); + if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); - } if (counter_select & CS_LINK_ERROR_RECOVERY) { write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); @@ -3223,7 +3204,9 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, /* if (counter_select & CS_PORT_MARK_FECN) * write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0); */ - /* port_vl_xmit_discards ??? */ + if (counter_select & C_SW_XMIT_DSCD_VL) + write_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl), 0); } if (resp_len) @@ -3392,7 +3375,7 @@ static void apply_cc_state(struct hfi1_pportdata *ppd) */ spin_lock(&ppd->cc_state_lock); - old_cc_state = get_cc_state(ppd); + old_cc_state = get_cc_state_protected(ppd); if (!old_cc_state) { /* never active, or shutting down */ spin_unlock(&ppd->cc_state_lock); @@ -3960,7 +3943,6 @@ void clear_linkup_counters(struct hfi1_devdata *dd) write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0); /* LocalLinkIntegrityErrors */ - write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0); write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); /* ExcessiveBufferOverruns */ write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0); diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h index 8b734aaae88a..5aa3fd1be653 100644 --- a/drivers/infiniband/hw/hfi1/mad.h +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -48,15 +48,8 @@ #define _HFI1_MAD_H #include <rdma/ib_pma.h> -#define 
USE_PI_LED_ENABLE 1 /* - * use led enabled bit in struct - * opa_port_states, if available - */ #include <rdma/opa_smi.h> #include <rdma/opa_port_info.h> -#ifndef PI_LED_ENABLE_SUP -#define PI_LED_ENABLE_SUP 0 -#endif #include "opa_compat.h" /* diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index b7a80aa1ae30..7ad30898fc19 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -53,19 +53,20 @@ #include "trace.h" struct mmu_rb_handler { - struct list_head list; struct mmu_notifier mn; - struct rb_root *root; + struct rb_root root; + void *ops_arg; spinlock_t lock; /* protect the RB tree */ struct mmu_rb_ops *ops; + struct mm_struct *mm; + struct list_head lru_list; + struct work_struct del_work; + struct list_head del_list; + struct workqueue_struct *wq; }; -static LIST_HEAD(mmu_rb_handlers); -static DEFINE_SPINLOCK(mmu_rb_lock); /* protect mmu_rb_handlers list */ - static unsigned long mmu_node_start(struct mmu_rb_node *); static unsigned long mmu_node_last(struct mmu_rb_node *); -static struct mmu_rb_handler *find_mmu_handler(struct rb_root *); static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *, unsigned long); static inline void mmu_notifier_range_start(struct mmu_notifier *, @@ -76,6 +77,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *, unsigned long, unsigned long); static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, unsigned long, unsigned long); +static void do_remove(struct mmu_rb_handler *handler, + struct list_head *del_list); +static void handle_remove(struct work_struct *work); static struct mmu_notifier_ops mn_opts = { .invalidate_page = mmu_notifier_page, @@ -95,73 +99,79 @@ static unsigned long mmu_node_last(struct mmu_rb_node *node) return PAGE_ALIGN(node->addr + node->len) - 1; } -int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops) +int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct 
*mm, + struct mmu_rb_ops *ops, + struct workqueue_struct *wq, + struct mmu_rb_handler **handler) { struct mmu_rb_handler *handlr; - - if (!ops->invalidate) - return -EINVAL; + int ret; handlr = kmalloc(sizeof(*handlr), GFP_KERNEL); if (!handlr) return -ENOMEM; - handlr->root = root; + handlr->root = RB_ROOT; handlr->ops = ops; + handlr->ops_arg = ops_arg; INIT_HLIST_NODE(&handlr->mn.hlist); spin_lock_init(&handlr->lock); handlr->mn.ops = &mn_opts; - spin_lock(&mmu_rb_lock); - list_add_tail_rcu(&handlr->list, &mmu_rb_handlers); - spin_unlock(&mmu_rb_lock); + handlr->mm = mm; + INIT_WORK(&handlr->del_work, handle_remove); + INIT_LIST_HEAD(&handlr->del_list); + INIT_LIST_HEAD(&handlr->lru_list); + handlr->wq = wq; + + ret = mmu_notifier_register(&handlr->mn, handlr->mm); + if (ret) { + kfree(handlr); + return ret; + } - return mmu_notifier_register(&handlr->mn, current->mm); + *handler = handlr; + return 0; } -void hfi1_mmu_rb_unregister(struct rb_root *root) +void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) { - struct mmu_rb_handler *handler = find_mmu_handler(root); + struct mmu_rb_node *rbnode; + struct rb_node *node; unsigned long flags; - - if (!handler) - return; + struct list_head del_list; /* Unregister first so we don't get any more notifications. */ - if (current->mm) - mmu_notifier_unregister(&handler->mn, current->mm); + mmu_notifier_unregister(&handler->mn, handler->mm); - spin_lock(&mmu_rb_lock); - list_del_rcu(&handler->list); - spin_unlock(&mmu_rb_lock); - synchronize_rcu(); + /* + * Make sure the wq delete handler is finished running. It will not + * be triggered once the mmu notifiers are unregistered above. 
+ */ + flush_work(&handler->del_work); + + INIT_LIST_HEAD(&del_list); spin_lock_irqsave(&handler->lock, flags); - if (!RB_EMPTY_ROOT(root)) { - struct rb_node *node; - struct mmu_rb_node *rbnode; - - while ((node = rb_first(root))) { - rbnode = rb_entry(node, struct mmu_rb_node, node); - rb_erase(node, root); - if (handler->ops->remove) - handler->ops->remove(root, rbnode, NULL); - } + while ((node = rb_first(&handler->root))) { + rbnode = rb_entry(node, struct mmu_rb_node, node); + rb_erase(node, &handler->root); + /* move from LRU list to delete list */ + list_move(&rbnode->list, &del_list); } spin_unlock_irqrestore(&handler->lock, flags); + do_remove(handler, &del_list); + kfree(handler); } -int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) +int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, + struct mmu_rb_node *mnode) { - struct mmu_rb_handler *handler = find_mmu_handler(root); struct mmu_rb_node *node; unsigned long flags; int ret = 0; - if (!handler) - return -EINVAL; - spin_lock_irqsave(&handler->lock, flags); hfi1_cdbg(MMU, "Inserting node addr 0x%llx, len %u", mnode->addr, mnode->len); @@ -170,12 +180,13 @@ int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) ret = -EINVAL; goto unlock; } - __mmu_int_rb_insert(mnode, root); + __mmu_int_rb_insert(mnode, &handler->root); + list_add(&mnode->list, &handler->lru_list); - if (handler->ops->insert) { - ret = handler->ops->insert(root, mnode); - if (ret) - __mmu_int_rb_remove(mnode, root); + ret = handler->ops->insert(handler->ops_arg, mnode); + if (ret) { + __mmu_int_rb_remove(mnode, &handler->root); + list_del(&mnode->list); /* remove from LRU list */ } unlock: spin_unlock_irqrestore(&handler->lock, flags); @@ -191,10 +202,10 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, hfi1_cdbg(MMU, "Searching for addr 0x%llx, len %u", addr, len); if (!handler->ops->filter) { - node = __mmu_int_rb_iter_first(handler->root, addr, + node = 
__mmu_int_rb_iter_first(&handler->root, addr, (addr + len) - 1); } else { - for (node = __mmu_int_rb_iter_first(handler->root, addr, + for (node = __mmu_int_rb_iter_first(&handler->root, addr, (addr + len) - 1); node; node = __mmu_int_rb_iter_next(node, addr, @@ -206,82 +217,72 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, return node; } -/* Caller must *not* hold handler lock. */ -static void __mmu_rb_remove(struct mmu_rb_handler *handler, - struct mmu_rb_node *node, struct mm_struct *mm) -{ - unsigned long flags; - - /* Validity of handler and node pointers has been checked by caller. */ - hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr, - node->len); - spin_lock_irqsave(&handler->lock, flags); - __mmu_int_rb_remove(node, handler->root); - spin_unlock_irqrestore(&handler->lock, flags); - - if (handler->ops->remove) - handler->ops->remove(handler->root, node, mm); -} - -struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr, - unsigned long len) +struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler, + unsigned long addr, unsigned long len) { - struct mmu_rb_handler *handler = find_mmu_handler(root); struct mmu_rb_node *node; unsigned long flags; - if (!handler) - return ERR_PTR(-EINVAL); - spin_lock_irqsave(&handler->lock, flags); node = __mmu_rb_search(handler, addr, len); + if (node) { + __mmu_int_rb_remove(node, &handler->root); + list_del(&node->list); /* remove from LRU list */ + } spin_unlock_irqrestore(&handler->lock, flags); return node; } -struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root, - unsigned long addr, unsigned long len) +void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) { - struct mmu_rb_handler *handler = find_mmu_handler(root); - struct mmu_rb_node *node; + struct mmu_rb_node *rbnode, *ptr; + struct list_head del_list; unsigned long flags; + bool stop = false; - if (!handler) - return ERR_PTR(-EINVAL); + 
INIT_LIST_HEAD(&del_list); spin_lock_irqsave(&handler->lock, flags); - node = __mmu_rb_search(handler, addr, len); - if (node) - __mmu_int_rb_remove(node, handler->root); + list_for_each_entry_safe_reverse(rbnode, ptr, &handler->lru_list, + list) { + if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg, + &stop)) { + __mmu_int_rb_remove(rbnode, &handler->root); + /* move from LRU list to delete list */ + list_move(&rbnode->list, &del_list); + } + if (stop) + break; + } spin_unlock_irqrestore(&handler->lock, flags); - return node; + while (!list_empty(&del_list)) { + rbnode = list_first_entry(&del_list, struct mmu_rb_node, list); + list_del(&rbnode->list); + handler->ops->remove(handler->ops_arg, rbnode); + } } -void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node) +/* + * It is up to the caller to ensure that this function does not race with the + * mmu invalidate notifier which may be calling the users remove callback on + * 'node'. + */ +void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, + struct mmu_rb_node *node) { - struct mmu_rb_handler *handler = find_mmu_handler(root); - - if (!handler || !node) - return; - - __mmu_rb_remove(handler, node, NULL); -} + unsigned long flags; -static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root) -{ - struct mmu_rb_handler *handler; + /* Validity of handler and node pointers has been checked by caller. 
*/ + hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr, + node->len); + spin_lock_irqsave(&handler->lock, flags); + __mmu_int_rb_remove(node, &handler->root); + list_del(&node->list); /* remove from LRU list */ + spin_unlock_irqrestore(&handler->lock, flags); - rcu_read_lock(); - list_for_each_entry_rcu(handler, &mmu_rb_handlers, list) { - if (handler->root == root) - goto unlock; - } - handler = NULL; -unlock: - rcu_read_unlock(); - return handler; + handler->ops->remove(handler->ops_arg, node); } static inline void mmu_notifier_page(struct mmu_notifier *mn, @@ -304,9 +305,10 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, { struct mmu_rb_handler *handler = container_of(mn, struct mmu_rb_handler, mn); - struct rb_root *root = handler->root; + struct rb_root *root = &handler->root; struct mmu_rb_node *node, *ptr = NULL; unsigned long flags; + bool added = false; spin_lock_irqsave(&handler->lock, flags); for (node = __mmu_int_rb_iter_first(root, start, end - 1); @@ -315,11 +317,53 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, ptr = __mmu_int_rb_iter_next(node, start, end - 1); hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u", node->addr, node->len); - if (handler->ops->invalidate(root, node)) { + if (handler->ops->invalidate(handler->ops_arg, node)) { __mmu_int_rb_remove(node, root); - if (handler->ops->remove) - handler->ops->remove(root, node, mm); + /* move from LRU list to delete list */ + list_move(&node->list, &handler->del_list); + added = true; } } spin_unlock_irqrestore(&handler->lock, flags); + + if (added) + queue_work(handler->wq, &handler->del_work); +} + +/* + * Call the remove function for the given handler and the list. This + * is expected to be called with a delete list extracted from handler. + * The caller should not be holding the handler lock. 
+ */ +static void do_remove(struct mmu_rb_handler *handler, + struct list_head *del_list) +{ + struct mmu_rb_node *node; + + while (!list_empty(del_list)) { + node = list_first_entry(del_list, struct mmu_rb_node, list); + list_del(&node->list); + handler->ops->remove(handler->ops_arg, node); + } +} + +/* + * Work queue function to remove all nodes that have been queued up to + * be removed. The key feature is that mm->mmap_sem is not being held + * and the remove callback can sleep while taking it, if needed. + */ +static void handle_remove(struct work_struct *work) +{ + struct mmu_rb_handler *handler = container_of(work, + struct mmu_rb_handler, + del_work); + struct list_head del_list; + unsigned long flags; + + /* remove anything that is queued to get removed */ + spin_lock_irqsave(&handler->lock, flags); + list_replace_init(&handler->del_list, &del_list); + spin_unlock_irqrestore(&handler->lock, flags); + + do_remove(handler, &del_list); } diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 7a57b9c49d27..754f6ebf13fb 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -54,23 +54,34 @@ struct mmu_rb_node { unsigned long len; unsigned long __last; struct rb_node node; + struct list_head list; }; +/* + * NOTE: filter, insert, invalidate, and evict must not sleep. Only remove is + * allowed to sleep. 
+ */ struct mmu_rb_ops { - bool (*filter)(struct mmu_rb_node *, unsigned long, unsigned long); - int (*insert)(struct rb_root *, struct mmu_rb_node *); - void (*remove)(struct rb_root *, struct mmu_rb_node *, - struct mm_struct *); - int (*invalidate)(struct rb_root *, struct mmu_rb_node *); + bool (*filter)(struct mmu_rb_node *node, unsigned long addr, + unsigned long len); + int (*insert)(void *ops_arg, struct mmu_rb_node *mnode); + void (*remove)(void *ops_arg, struct mmu_rb_node *mnode); + int (*invalidate)(void *ops_arg, struct mmu_rb_node *node); + int (*evict)(void *ops_arg, struct mmu_rb_node *mnode, + void *evict_arg, bool *stop); }; -int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops); -void hfi1_mmu_rb_unregister(struct rb_root *); -int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); -void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *); -struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long, - unsigned long); -struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long, - unsigned long); +int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, + struct mmu_rb_ops *ops, + struct workqueue_struct *wq, + struct mmu_rb_handler **handler); +void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); +int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, + struct mmu_rb_node *mnode); +void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg); +void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, + struct mmu_rb_node *mnode); +struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler, + unsigned long addr, unsigned long len); #endif /* _HFI1_MMU_RB_H */ diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 0bac21e6a658..89c68da1c273 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -679,6 +679,10 @@ static uint pcie_pset = UNSET_PSET; module_param(pcie_pset, uint, S_IRUGO); 
MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10"); +static uint pcie_ctle = 1; /* discrete on, integrated off */ +module_param(pcie_ctle, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_ctle, "PCIe static CTLE mode, bit 0 - discrete on/off, bit 1 - integrated on/off"); + /* equalization columns */ #define PREC 0 #define ATTN 1 @@ -716,6 +720,36 @@ static const u8 integrated_preliminary_eq[11][3] = { { 0x00, 0x1e, 0x0a }, /* p10 */ }; +static const u8 discrete_ctle_tunings[11][4] = { + /* DC LF HF BW */ + { 0x48, 0x0b, 0x04, 0x04 }, /* p0 */ + { 0x60, 0x05, 0x0f, 0x0a }, /* p1 */ + { 0x50, 0x09, 0x06, 0x06 }, /* p2 */ + { 0x68, 0x05, 0x0f, 0x0a }, /* p3 */ + { 0x80, 0x05, 0x0f, 0x0a }, /* p4 */ + { 0x70, 0x05, 0x0f, 0x0a }, /* p5 */ + { 0x68, 0x05, 0x0f, 0x0a }, /* p6 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p7 */ + { 0x48, 0x09, 0x06, 0x06 }, /* p8 */ + { 0x60, 0x05, 0x0f, 0x0a }, /* p9 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p10 */ +}; + +static const u8 integrated_ctle_tunings[11][4] = { + /* DC LF HF BW */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p0 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p1 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p2 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p3 */ + { 0x58, 0x0a, 0x05, 0x05 }, /* p4 */ + { 0x48, 0x0a, 0x05, 0x05 }, /* p5 */ + { 0x40, 0x0a, 0x05, 0x05 }, /* p6 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p7 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p8 */ + { 0x38, 0x09, 0x06, 0x06 }, /* p9 */ + { 0x38, 0x0e, 0x01, 0x01 }, /* p10 */ +}; + /* helper to format the value to write to hardware */ #define eq_value(pre, curr, post) \ ((((u32)(pre)) << \ @@ -951,11 +985,14 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd) u32 status, err; int ret; int do_retry, retry_count = 0; + int intnum = 0; uint default_pset; u16 target_vector, target_speed; u16 lnkctl2, vendor; u8 div; const u8 (*eq)[3]; + const u8 (*ctle_tunings)[4]; + uint static_ctle_mode; int return_error = 0; /* PCIe Gen3 is for the ASIC only */ @@ -1089,6 +1126,9 @@ retry: div = 3; eq = 
discrete_preliminary_eq; default_pset = DEFAULT_DISCRETE_PSET; + ctle_tunings = discrete_ctle_tunings; + /* bit 0 - discrete on/off */ + static_ctle_mode = pcie_ctle & 0x1; } else { /* 400mV, FS=29, LF = 9 */ fs = 29; @@ -1096,6 +1136,9 @@ retry: div = 1; eq = integrated_preliminary_eq; default_pset = DEFAULT_MCP_PSET; + ctle_tunings = integrated_ctle_tunings; + /* bit 1 - integrated on/off */ + static_ctle_mode = (pcie_ctle >> 1) & 0x1; } pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101, (fs << @@ -1135,16 +1178,33 @@ retry: * step 5c: Program gasket interrupts */ /* set the Rx Bit Rate to REFCLK ratio */ - write_gasket_interrupt(dd, 0, 0x0006, 0x0050); + write_gasket_interrupt(dd, intnum++, 0x0006, 0x0050); /* disable pCal for PCIe Gen3 RX equalization */ - write_gasket_interrupt(dd, 1, 0x0026, 0x5b01); + /* select adaptive or static CTLE */ + write_gasket_interrupt(dd, intnum++, 0x0026, + 0x5b01 | (static_ctle_mode << 3)); /* * Enable iCal for PCIe Gen3 RX equalization, and set which * evaluation of RX_EQ_EVAL will launch the iCal procedure. 
*/ - write_gasket_interrupt(dd, 2, 0x0026, 0x5202); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x5202); + + if (static_ctle_mode) { + /* apply static CTLE tunings */ + u8 pcie_dc, pcie_lf, pcie_hf, pcie_bw; + + pcie_dc = ctle_tunings[pcie_pset][0]; + pcie_lf = ctle_tunings[pcie_pset][1]; + pcie_hf = ctle_tunings[pcie_pset][2]; + pcie_bw = ctle_tunings[pcie_pset][3]; + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0200 | pcie_dc); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0100 | pcie_lf); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0000 | pcie_hf); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x5500 | pcie_bw); + } + /* terminate list */ - write_gasket_interrupt(dd, 3, 0x0000, 0x0000); + write_gasket_interrupt(dd, intnum++, 0x0000, 0x0000); /* * step 5d: program XMT margin diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index d4022450b73f..ac1bf4a73571 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1952,13 +1952,17 @@ int init_pervl_scs(struct hfi1_devdata *dd) dd->vld[15].sc = sc_alloc(dd, SC_VL15, dd->rcd[0]->rcvhdrqentsize, dd->node); if (!dd->vld[15].sc) - goto nomem; + return -ENOMEM; + hfi1_init_ctxt(dd->vld[15].sc); dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048); - dd->kernel_send_context = kmalloc_node(dd->num_send_contexts * + dd->kernel_send_context = kzalloc_node(dd->num_send_contexts * sizeof(struct send_context *), GFP_KERNEL, dd->node); + if (!dd->kernel_send_context) + goto freesc15; + dd->kernel_send_context[0] = dd->vld[15].sc; for (i = 0; i < num_vls; i++) { @@ -2010,12 +2014,21 @@ int init_pervl_scs(struct hfi1_devdata *dd) if (pio_map_init(dd, ppd->port - 1, num_vls, NULL)) goto nomem; return 0; + nomem: - sc_free(dd->vld[15].sc); - for (i = 0; i < num_vls; i++) + for (i = 0; i < num_vls; i++) { sc_free(dd->vld[i].sc); + dd->vld[i].sc = NULL; + } + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) sc_free(dd->kernel_send_context[i + 1]); + + 
kfree(dd->kernel_send_context); + dd->kernel_send_context = NULL; + +freesc15: + sc_free(dd->vld[15].sc); return -ENOMEM; } diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c index 03df9322f862..965c8aef0c60 100644 --- a/drivers/infiniband/hw/hfi1/platform.c +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -537,20 +537,6 @@ static void apply_tunings( u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0; u8 *cache = ppd->qsfp_info.cache; - /* Enable external device config if channel is limiting active */ - read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS, - GENERAL_CONFIG, &config_data); - config_data &= ~(0xff << ENABLE_EXT_DEV_CONFIG_SHIFT); - config_data |= ((u32)limiting_active << ENABLE_EXT_DEV_CONFIG_SHIFT); - ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS, - GENERAL_CONFIG, config_data); - if (ret != HCMD_SUCCESS) - dd_dev_err( - ppd->dd, - "%s: Failed to set enable external device config\n", - __func__); - - config_data = 0; /* re-init */ /* Pass tuning method to 8051 */ read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, &config_data); @@ -638,9 +624,13 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, if (ret) return ret; + /* + * We'll change the QSFP memory contents from here on out, thus we set a + * flag here to remind ourselves to reset the QSFP module. This prevents + * reuse of stale settings established in our previous pass through. 
+ */ if (ppd->qsfp_info.reset_needed) { reset_qsfp(ppd); - ppd->qsfp_info.reset_needed = 0; refresh_qsfp_cache(ppd, &ppd->qsfp_info); } else { ppd->qsfp_info.reset_needed = 1; diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 1a942ffba4cb..a5aa3517e7d5 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -52,6 +52,7 @@ #include <linux/seq_file.h> #include <rdma/rdma_vt.h> #include <rdma/rdmavt_qp.h> +#include <rdma/ib_verbs.h> #include "hfi.h" #include "qp.h" @@ -115,6 +116,66 @@ static const u16 credit_table[31] = { 32768 /* 1E */ }; +const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { +[IB_WR_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_RDMA_READ] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC, +}, + +[IB_WR_ATOMIC_CMP_AND_SWP] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_ATOMIC_FETCH_AND_ADD] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_RDMA_WRITE_WITH_IMM] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND_WITH_IMM] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_REG_MR] = { + .length = sizeof(struct ib_reg_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), + .flags = RVT_OPERATION_LOCAL, +}, + +[IB_WR_LOCAL_INV] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UC) | 
BIT(IB_QPT_RC), + .flags = RVT_OPERATION_LOCAL, +}, + +[IB_WR_SEND_WITH_INV] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_RC), +}, + +}; + static void flush_tx_list(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; @@ -745,8 +806,9 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, priv->owner = qp; - priv->s_hdr = kzalloc_node(sizeof(*priv->s_hdr), gfp, rdi->dparms.node); - if (!priv->s_hdr) { + priv->s_ahg = kzalloc_node(sizeof(*priv->s_ahg), gfp, + rdi->dparms.node); + if (!priv->s_ahg) { kfree(priv); return ERR_PTR(-ENOMEM); } @@ -759,7 +821,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; - kfree(priv->s_hdr); + kfree(priv->s_ahg); kfree(priv); } diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index e7bc8d6cf681..587d84d65bb8 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -54,6 +54,8 @@ extern unsigned int hfi1_qp_table_size; +extern const struct rvt_operation_params hfi1_post_parms[]; + /* * free_ahg - clear ahg from QP */ @@ -61,7 +63,7 @@ static inline void clear_ahg(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; - priv->s_hdr->ahgcount = 0; + priv->s_ahg->ahgcount = 0; qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR); if (priv->s_sde && qp->s_ahgidx >= 0) sdma_ahg_free(priv->s_sde, qp->s_ahgidx); diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c index 9fb561682c66..a207717ade2a 100644 --- a/drivers/infiniband/hw/hfi1/qsfp.c +++ b/drivers/infiniband/hw/hfi1/qsfp.c @@ -50,46 +50,285 @@ #include <linux/vmalloc.h> #include "hfi.h" -#include "twsi.h" + +/* for the given bus number, return the CSR for reading an i2c line */ +static inline u32 i2c_in_csr(u32 bus_num) +{ + return bus_num ? 
ASIC_QSFP2_IN : ASIC_QSFP1_IN; +} + +/* for the given bus number, return the CSR for writing an i2c line */ +static inline u32 i2c_oe_csr(u32 bus_num) +{ + return bus_num ? ASIC_QSFP2_OE : ASIC_QSFP1_OE; +} + +static void hfi1_setsda(void *data, int state) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + struct hfi1_devdata *dd = bus->controlling_dd; + u64 reg; + u32 target_oe; + + target_oe = i2c_oe_csr(bus->num); + reg = read_csr(dd, target_oe); + /* + * The OE bit value is inverted and connected to the pin. When + * OE is 0 the pin is left to be pulled up, when the OE is 1 + * the pin is driven low. This matches the "open drain" or "open + * collector" convention. + */ + if (state) + reg &= ~QSFP_HFI0_I2CDAT; + else + reg |= QSFP_HFI0_I2CDAT; + write_csr(dd, target_oe, reg); + /* do a read to force the write into the chip */ + (void)read_csr(dd, target_oe); +} + +static void hfi1_setscl(void *data, int state) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + struct hfi1_devdata *dd = bus->controlling_dd; + u64 reg; + u32 target_oe; + + target_oe = i2c_oe_csr(bus->num); + reg = read_csr(dd, target_oe); + /* + * The OE bit value is inverted and connected to the pin. When + * OE is 0 the pin is left to be pulled up, when the OE is 1 + * the pin is driven low. This matches the "open drain" or "open + * collector" convention. 
+ */ + if (state) + reg &= ~QSFP_HFI0_I2CCLK; + else + reg |= QSFP_HFI0_I2CCLK; + write_csr(dd, target_oe, reg); + /* do a read to force the write into the chip */ + (void)read_csr(dd, target_oe); +} + +static int hfi1_getsda(void *data) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + u64 reg; + u32 target_in; + + hfi1_setsda(data, 1); /* clear OE so we do not pull line down */ + udelay(2); /* 1us pull up + 250ns hold */ + + target_in = i2c_in_csr(bus->num); + reg = read_csr(bus->controlling_dd, target_in); + return !!(reg & QSFP_HFI0_I2CDAT); +} + +static int hfi1_getscl(void *data) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + u64 reg; + u32 target_in; + + hfi1_setscl(data, 1); /* clear OE so we do not pull line down */ + udelay(2); /* 1us pull up + 250ns hold */ + + target_in = i2c_in_csr(bus->num); + reg = read_csr(bus->controlling_dd, target_in); + return !!(reg & QSFP_HFI0_I2CCLK); +} /* - * QSFP support for hfi driver, using "Two Wire Serial Interface" driver - * in twsi.c + * Allocate and initialize the given i2c bus number. + * Returns NULL on failure. 
*/ -#define I2C_MAX_RETRY 4 +static struct hfi1_i2c_bus *init_i2c_bus(struct hfi1_devdata *dd, + struct hfi1_asic_data *ad, int num) +{ + struct hfi1_i2c_bus *bus; + int ret; + + bus = kzalloc(sizeof(*bus), GFP_KERNEL); + if (!bus) + return NULL; + + bus->controlling_dd = dd; + bus->num = num; /* our bus number */ + + bus->algo.setsda = hfi1_setsda; + bus->algo.setscl = hfi1_setscl; + bus->algo.getsda = hfi1_getsda; + bus->algo.getscl = hfi1_getscl; + bus->algo.udelay = 5; + bus->algo.timeout = usecs_to_jiffies(50); + bus->algo.data = bus; + + bus->adapter.owner = THIS_MODULE; + bus->adapter.algo_data = &bus->algo; + bus->adapter.dev.parent = &dd->pcidev->dev; + snprintf(bus->adapter.name, sizeof(bus->adapter.name), + "hfi1_i2c%d", num); + + ret = i2c_bit_add_bus(&bus->adapter); + if (ret) { + dd_dev_info(dd, "%s: unable to add i2c bus %d, err %d\n", + __func__, num, ret); + kfree(bus); + return NULL; + } + + return bus; +} /* - * Raw i2c write. No set-up or lock checking. + * Initialize i2c buses. + * Return 0 on success, -errno on error. 
*/ -static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, - int offset, void *bp, int len) +int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad) { - struct hfi1_devdata *dd = ppd->dd; - int ret, cnt; - u8 *buff = bp; + ad->i2c_bus0 = init_i2c_bus(dd, ad, 0); + ad->i2c_bus1 = init_i2c_bus(dd, ad, 1); + if (!ad->i2c_bus0 || !ad->i2c_bus1) + return -ENOMEM; + return 0; +}; - cnt = 0; - while (cnt < len) { - int wlen = len - cnt; +static void clean_i2c_bus(struct hfi1_i2c_bus *bus) +{ + if (bus) { + i2c_del_adapter(&bus->adapter); + kfree(bus); + } +} - ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset, - buff + cnt, wlen); - if (ret) { - /* hfi1_twsi_blk_wr() 1 for error, else 0 */ - return -EIO; - } - offset += wlen; - cnt += wlen; +void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad) +{ + clean_i2c_bus(ad->i2c_bus0); + ad->i2c_bus0 = NULL; + clean_i2c_bus(ad->i2c_bus1); + ad->i2c_bus1 = NULL; +} + +static int i2c_bus_write(struct hfi1_devdata *dd, struct hfi1_i2c_bus *i2c, + u8 slave_addr, int offset, int offset_size, + u8 *data, u16 len) +{ + int ret; + int num_msgs; + u8 offset_bytes[2]; + struct i2c_msg msgs[2]; + + switch (offset_size) { + case 0: + num_msgs = 1; + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = len; + msgs[0].buf = data; + break; + case 2: + offset_bytes[1] = (offset >> 8) & 0xff; + /* fall through */ + case 1: + num_msgs = 2; + offset_bytes[0] = offset & 0xff; + + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = offset_size; + msgs[0].buf = offset_bytes; + + msgs[1].addr = slave_addr; + msgs[1].flags = I2C_M_NOSTART, + msgs[1].len = len; + msgs[1].buf = data; + break; + default: + return -EINVAL; } - /* Must wait min 20us between qsfp i2c transactions */ - udelay(20); + i2c->controlling_dd = dd; + ret = i2c_transfer(&i2c->adapter, msgs, num_msgs); + if (ret != num_msgs) { + dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; write failed, ret 
%d\n", + __func__, i2c->num, slave_addr, offset, len, ret); + return ret < 0 ? ret : -EIO; + } + return 0; +} + +static int i2c_bus_read(struct hfi1_devdata *dd, struct hfi1_i2c_bus *bus, + u8 slave_addr, int offset, int offset_size, + u8 *data, u16 len) +{ + int ret; + int num_msgs; + u8 offset_bytes[2]; + struct i2c_msg msgs[2]; + + switch (offset_size) { + case 0: + num_msgs = 1; + msgs[0].addr = slave_addr; + msgs[0].flags = I2C_M_RD; + msgs[0].len = len; + msgs[0].buf = data; + break; + case 2: + offset_bytes[1] = (offset >> 8) & 0xff; + /* fall through */ + case 1: + num_msgs = 2; + offset_bytes[0] = offset & 0xff; + + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = offset_size; + msgs[0].buf = offset_bytes; + + msgs[1].addr = slave_addr; + msgs[1].flags = I2C_M_RD, + msgs[1].len = len; + msgs[1].buf = data; + break; + default: + return -EINVAL; + } - return cnt; + bus->controlling_dd = dd; + ret = i2c_transfer(&bus->adapter, msgs, num_msgs); + if (ret != num_msgs) { + dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; read failed, ret %d\n", + __func__, bus->num, slave_addr, offset, len, ret); + return ret < 0 ? ret : -EIO; + } + return 0; +} + +/* + * Raw i2c write. No set-up or lock checking. + * + * Return 0 on success, -errno on error. + */ +static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len) +{ + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_i2c_bus *bus; + u8 slave_addr; + int offset_size; + + bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0; + slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */ + offset_size = (i2c_addr >> 8) & 0x3; + return i2c_bus_write(dd, bus, slave_addr, offset, offset_size, bp, len); } /* * Caller must hold the i2c chain resource. + * + * Return number of bytes written, or -errno. 
*/ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, void *bp, int len) @@ -99,63 +338,36 @@ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; - /* make sure the TWSI bus is in a sane state */ - ret = hfi1_twsi_reset(ppd->dd, target); - if (ret) { - hfi1_dev_porterr(ppd->dd, ppd->port, - "I2C chain %d write interface reset failed\n", - target); + ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len); + if (ret) return ret; - } - return __i2c_write(ppd, target, i2c_addr, offset, bp, len); + return len; } /* * Raw i2c read. No set-up or lock checking. + * + * Return 0 on success, -errno on error. */ static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, void *bp, int len) { struct hfi1_devdata *dd = ppd->dd; - int ret, cnt, pass = 0; - int orig_offset = offset; - - cnt = 0; - while (cnt < len) { - int rlen = len - cnt; - - ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset, - bp + cnt, rlen); - /* Some QSFP's fail first try. Retry as experiment */ - if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY) - continue; - if (ret) { - /* hfi1_twsi_blk_rd() 1 for error, else 0 */ - ret = -EIO; - goto exit; - } - offset += rlen; - cnt += rlen; - } - - ret = cnt; - -exit: - if (ret < 0) { - hfi1_dev_porterr(dd, ppd->port, - "I2C chain %d read failed, addr 0x%x, offset 0x%x, len %d\n", - target, i2c_addr, orig_offset, len); - } - - /* Must wait min 20us between qsfp i2c transactions */ - udelay(20); - - return ret; + struct hfi1_i2c_bus *bus; + u8 slave_addr; + int offset_size; + + bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0; + slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */ + offset_size = (i2c_addr >> 8) & 0x3; + return i2c_bus_read(dd, bus, slave_addr, offset, offset_size, bp, len); } /* * Caller must hold the i2c chain resource. 
+ * + * Return number of bytes read, or -errno. */ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, void *bp, int len) @@ -165,16 +377,11 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; - /* make sure the TWSI bus is in a sane state */ - ret = hfi1_twsi_reset(ppd->dd, target); - if (ret) { - hfi1_dev_porterr(ppd->dd, ppd->port, - "I2C chain %d read interface reset failed\n", - target); + ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len); + if (ret) return ret; - } - return __i2c_read(ppd, target, i2c_addr, offset, bp, len); + return len; } /* @@ -182,6 +389,8 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, * by writing @addr = ((256 * n) + m) * * Caller must hold the i2c chain resource. + * + * Return number of bytes written or -errno. */ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len) @@ -189,21 +398,12 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int count = 0; int offset; int nwrite; - int ret; + int ret = 0; u8 page; if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; - /* make sure the TWSI bus is in a sane state */ - ret = hfi1_twsi_reset(ppd->dd, target); - if (ret) { - hfi1_dev_porterr(ppd->dd, ppd->port, - "QSFP chain %d write interface reset failed\n", - target); - return ret; - } - while (count < len) { /* * Set the qsfp page based on a zero-based address @@ -213,11 +413,12 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); - if (ret != 1) { + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) { hfi1_dev_porterr(ppd->dd, ppd->port, "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", target, ret); - ret = -EIO; 
break; } @@ -229,11 +430,13 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, offset, bp + count, nwrite); - if (ret <= 0) /* stop on error or nothing written */ + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) /* stop on error */ break; - count += ret; - addr += ret; + count += nwrite; + addr += nwrite; } if (ret < 0) @@ -243,7 +446,7 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, /* * Perform a stand-alone single QSFP write. Acquire the resource, do the - * read, then release the resource. + * write, then release the resource. */ int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len) @@ -266,6 +469,8 @@ int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, * by reading @addr = ((256 * n) + m) * * Caller must hold the i2c chain resource. + * + * Return the number of bytes read or -errno. 
*/ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len) @@ -273,21 +478,12 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int count = 0; int offset; int nread; - int ret; + int ret = 0; u8 page; if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; - /* make sure the TWSI bus is in a sane state */ - ret = hfi1_twsi_reset(ppd->dd, target); - if (ret) { - hfi1_dev_porterr(ppd->dd, ppd->port, - "QSFP chain %d read interface reset failed\n", - target); - return ret; - } - while (count < len) { /* * Set the qsfp page based on a zero-based address @@ -296,11 +492,12 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, page = (u8)(addr / QSFP_PAGESIZE); ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); - if (ret != 1) { + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) { hfi1_dev_porterr(ppd->dd, ppd->port, "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", target, ret); - ret = -EIO; break; } @@ -310,15 +507,13 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY) nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY); - /* QSFPs require a 5-10msec delay after write operations */ - mdelay(5); ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, offset, bp + count, nread); - if (ret <= 0) /* stop on error or nothing read */ + if (ret) /* stop on error */ break; - count += ret; - addr += ret; + count += nread; + addr += nread; } if (ret < 0) diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h index dadc66c442b9..69275ebd9597 100644 --- a/drivers/infiniband/hw/hfi1/qsfp.h +++ b/drivers/infiniband/hw/hfi1/qsfp.h @@ -238,3 +238,6 @@ int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); int one_qsfp_read(struct hfi1_pportdata 
*ppd, u32 target, int addr, void *bp, int len); +struct hfi1_asic_data; +int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad); +void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 792f15eb8efe..5da190e6011b 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -477,6 +477,37 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->s_flags |= RVT_S_WAIT_FENCE; goto bail; } + /* + * Local operations are processed immediately + * after all prior requests have completed + */ + if (wqe->wr.opcode == IB_WR_REG_MR || + wqe->wr.opcode == IB_WR_LOCAL_INV) { + int local_ops = 0; + int err = 0; + + if (qp->s_last != qp->s_cur) + goto bail; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + if (++qp->s_tail == qp->s_size) + qp->s_tail = 0; + if (!(wqe->wr.send_flags & + RVT_SEND_COMPLETION_ONLY)) { + err = rvt_invalidate_rkey( + qp, + wqe->wr.ex.invalidate_rkey); + local_ops = 1; + } + hfi1_send_complete(qp, wqe, + err ? IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); + if (local_ops) + atomic_dec(&qp->local_ops_pending); + qp->s_hdrwords = 0; + goto done_free_tx; + } + newreq = 1; qp->s_psn = wqe->psn; } @@ -491,6 +522,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) switch (wqe->wr.opcode) { case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: /* If no credit, return. 
*/ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { @@ -504,11 +536,17 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } if (wqe->wr.opcode == IB_WR_SEND) { qp->s_state = OP(SEND_ONLY); - } else { + } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ ohdr->u.imm_data = wqe->wr.ex.imm_data; hwords += 1; + } else { + qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE); + /* Invalidate rkey comes after the BTH */ + ohdr->u.ieth = cpu_to_be32( + wqe->wr.ex.invalidate_rkey); + hwords += 1; } if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= IB_BTH_SOLICITED; @@ -671,11 +709,16 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } if (wqe->wr.opcode == IB_WR_SEND) { qp->s_state = OP(SEND_LAST); - } else { + } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ ohdr->u.imm_data = wqe->wr.ex.imm_data; hwords += 1; + } else { + qp->s_state = OP(SEND_LAST_WITH_INVALIDATE); + /* invalidate data comes after the BTH */ + ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey); + hwords += 1; } if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= IB_BTH_SOLICITED; @@ -1047,7 +1090,7 @@ void hfi1_rc_timeout(unsigned long arg) ibp->rvp.n_rc_timeouts++; qp->s_flags &= ~RVT_S_TIMER; del_timer(&qp->s_timer); - trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1); + trace_hfi1_timeout(qp, qp->s_last_psn + 1); restart_rc(qp, qp->s_last_psn + 1, 1); hfi1_schedule_send(qp); } @@ -1171,7 +1214,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr) * If we were waiting for sends to complete before re-sending, * and they are now complete, restart sending. 
*/ - trace_hfi1_rc_sendcomplete(qp, psn); + trace_hfi1_sendcomplete(qp, psn); if (qp->s_flags & RVT_S_WAIT_PSN && cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { qp->s_flags &= ~RVT_S_WAIT_PSN; @@ -1567,7 +1610,7 @@ static void rc_rcv_resp(struct hfi1_ibport *ibp, spin_lock_irqsave(&qp->s_lock, flags); - trace_hfi1_rc_ack(qp, psn); + trace_hfi1_ack(qp, psn); /* Ignore invalid responses. */ smp_read_barrier_depends(); /* see post_one_send */ @@ -1782,7 +1825,7 @@ static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data, u8 i, prev; int old_req; - trace_hfi1_rc_rcv_error(qp, psn); + trace_hfi1_rcv_error(qp, psn); if (diff > 0) { /* * Packet sequence error. @@ -2086,7 +2129,6 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_other_headers *ohdr = packet->ohdr; u32 bth0, opcode; u32 hdrsize = packet->hlen; @@ -2097,30 +2139,15 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) int diff; struct ib_reth *reth; unsigned long flags; - u32 bth1; int ret, is_fecn = 0; int copy_last = 0; + u32 rkey; bth0 = be32_to_cpu(ohdr->bth[0]); if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0)) return; - bth1 = be32_to_cpu(ohdr->bth[1]); - if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) { - if (bth1 & HFI1_BECN_SMASK) { - u16 rlid = qp->remote_ah_attr.dlid; - u32 lqpn, rqpn; - - lqpn = qp->ibqp.qp_num; - rqpn = qp->remote_qpn; - process_becn( - ppd, - qp->remote_ah_attr.sl, - rlid, lqpn, rqpn, - IB_CC_SVCTYPE_RC); - } - is_fecn = bth1 & HFI1_FECN_SMASK; - } + is_fecn = process_ecn(qp, packet, false); psn = be32_to_cpu(ohdr->bth[2]); opcode = (bth0 >> 24) & 0xff; @@ -2154,7 +2181,8 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) case OP(SEND_MIDDLE): if (opcode == OP(SEND_MIDDLE) || opcode == OP(SEND_LAST) || - opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + 
opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(SEND_LAST_WITH_INVALIDATE)) break; goto nack_inv; @@ -2170,6 +2198,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) if (opcode == OP(SEND_MIDDLE) || opcode == OP(SEND_LAST) || opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(SEND_LAST_WITH_INVALIDATE) || opcode == OP(RDMA_WRITE_MIDDLE) || opcode == OP(RDMA_WRITE_LAST) || opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) @@ -2218,6 +2247,7 @@ send_middle: case OP(SEND_ONLY): case OP(SEND_ONLY_WITH_IMMEDIATE): + case OP(SEND_ONLY_WITH_INVALIDATE): ret = hfi1_rvt_get_rwqe(qp, 0); if (ret < 0) goto nack_op_err; @@ -2226,12 +2256,22 @@ send_middle: qp->r_rcv_len = 0; if (opcode == OP(SEND_ONLY)) goto no_immediate_data; + if (opcode == OP(SEND_ONLY_WITH_INVALIDATE)) + goto send_last_inv; /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */ case OP(SEND_LAST_WITH_IMMEDIATE): send_last_imm: wc.ex.imm_data = ohdr->u.imm_data; wc.wc_flags = IB_WC_WITH_IMM; goto send_last; + case OP(SEND_LAST_WITH_INVALIDATE): +send_last_inv: + rkey = be32_to_cpu(ohdr->u.ieth); + if (rvt_invalidate_rkey(qp, rkey)) + goto no_immediate_data; + wc.ex.invalidate_rkey = rkey; + wc.wc_flags = IB_WC_WITH_INVALIDATE; + goto send_last; case OP(RDMA_WRITE_LAST): copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user; /* fall through */ diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index a659aec3c3c6..48d5094f98e2 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -372,6 +372,7 @@ static void ruc_loopback(struct rvt_qp *sqp) int ret; int copy_last = 0; u32 to; + int local_ops = 0; rcu_read_lock(); @@ -440,11 +441,31 @@ again: sqp->s_sge.num_sge = wqe->wr.num_sge; sqp->s_len = wqe->length; switch (wqe->wr.opcode) { + case IB_WR_REG_MR: + goto send_comp; + + case IB_WR_LOCAL_INV: + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { + if (rvt_invalidate_rkey(sqp, + wqe->wr.ex.invalidate_rkey)) + send_status = IB_WC_LOC_PROT_ERR; + 
local_ops = 1; + } + goto send_comp; + + case IB_WR_SEND_WITH_INV: + if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) { + wc.wc_flags = IB_WC_WITH_INVALIDATE; + wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey; + } + goto send; + case IB_WR_SEND_WITH_IMM: wc.wc_flags = IB_WC_WITH_IMM; wc.ex.imm_data = wqe->wr.ex.imm_data; /* FALLTHROUGH */ case IB_WR_SEND: +send: ret = hfi1_rvt_get_rwqe(qp, 0); if (ret < 0) goto op_err; @@ -583,6 +604,10 @@ send_comp: flush_send: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; hfi1_send_complete(sqp, wqe, send_status); + if (local_ops) { + atomic_dec(&sqp->local_ops_pending); + local_ops = 0; + } goto again; rnr_nak: @@ -683,10 +708,10 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, return sizeof(struct ib_grh) / sizeof(u32); } -#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4) +#define BTH2_OFFSET (offsetof(struct hfi1_sdma_header, hdr.u.oth.bth[2]) / 4) /** - * build_ahg - create ahg in s_hdr + * build_ahg - create ahg in s_ahg * @qp: a pointer to QP * @npsn: the next PSN for the request/response * @@ -708,19 +733,18 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn) qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde); if (qp->s_ahgidx >= 0) { qp->s_ahgpsn = npsn; - priv->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY; + priv->s_ahg->tx_flags |= SDMA_TXREQ_F_AHG_COPY; /* save to protect a change in another thread */ - priv->s_hdr->sde = priv->s_sde; - priv->s_hdr->ahgidx = qp->s_ahgidx; + priv->s_ahg->ahgidx = qp->s_ahgidx; qp->s_flags |= RVT_S_AHG_VALID; } } else { /* subsequent middle after valid */ if (qp->s_ahgidx >= 0) { - priv->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG; - priv->s_hdr->ahgidx = qp->s_ahgidx; - priv->s_hdr->ahgcount++; - priv->s_hdr->ahgdesc[0] = + priv->s_ahg->tx_flags |= SDMA_TXREQ_F_USE_AHG; + priv->s_ahg->ahgidx = qp->s_ahgidx; + priv->s_ahg->ahgcount++; + priv->s_ahg->ahgdesc[0] = sdma_build_ahg_descriptor( (__force u16)cpu_to_be16((u16)npsn), BTH2_OFFSET, @@ 
-728,8 +752,8 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn) 16); if ((npsn & 0xffff0000) != (qp->s_ahgpsn & 0xffff0000)) { - priv->s_hdr->ahgcount++; - priv->s_hdr->ahgdesc[1] = + priv->s_ahg->ahgcount++; + priv->s_ahg->ahgdesc[1] = sdma_build_ahg_descriptor( (__force u16)cpu_to_be16( (u16)(npsn >> 16)), @@ -766,7 +790,7 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr, } lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4; /* - * reset s_hdr/AHG fields + * reset s_ahg/AHG fields * * This insures that the ahgentry/ahgcount * are at a non-AHG default to protect @@ -776,10 +800,9 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr, * build_ahg() will modify as appropriate * to use the AHG feature. */ - priv->s_hdr->tx_flags = 0; - priv->s_hdr->ahgcount = 0; - priv->s_hdr->ahgidx = 0; - priv->s_hdr->sde = NULL; + priv->s_ahg->tx_flags = 0; + priv->s_ahg->ahgcount = 0; + priv->s_ahg->ahgidx = 0; if (qp->s_mig_state == IB_MIG_MIGRATED) bth0 |= IB_BTH_MIG_REQ; else @@ -890,7 +913,7 @@ void hfi1_do_send(struct rvt_qp *qp) */ if (hfi1_verbs_send(qp, &ps)) return; - /* Record that s_hdr is empty. */ + /* Record that s_ahg is empty. 
*/ qp->s_hdrwords = 0; /* allow other tasks to run */ if (unlikely(time_after(jiffies, timeout))) { diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index 91fc2aed6aed..74c84c655f7e 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -49,6 +49,7 @@ #include "hfi.h" #include "mad.h" #include "trace.h" +#include "affinity.h" /* * Start of per-port congestion control structures and support code @@ -622,6 +623,27 @@ static ssize_t show_tempsense(struct device *device, return ret; } +static ssize_t show_sdma_affinity(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + return hfi1_get_sdma_affinity(dd, buf); +} + +static ssize_t store_sdma_affinity(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + return hfi1_set_sdma_affinity(dd, buf, count); +} + /* * end of per-unit (or driver, in some cases, but replicated * per unit) functions @@ -636,6 +658,8 @@ static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); +static DEVICE_ATTR(sdma_affinity, S_IWUSR | S_IRUGO, show_sdma_affinity, + store_sdma_affinity); static struct device_attribute *hfi1_attributes[] = { &dev_attr_hw_rev, @@ -646,6 +670,7 @@ static struct device_attribute *hfi1_attributes[] = { &dev_attr_boardversion, &dev_attr_tempsense, &dev_attr_chip_reset, + &dev_attr_sdma_affinity, }; int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, diff --git a/drivers/infiniband/hw/hfi1/trace.h 
b/drivers/infiniband/hw/hfi1/trace.h index 28c1d0832886..92dc88f013c9 100644 --- a/drivers/infiniband/hw/hfi1/trace.h +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -44,1329 +44,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ -#undef TRACE_SYSTEM_VAR -#define TRACE_SYSTEM_VAR hfi1 - -#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) -#define __HFI1_TRACE_H - -#include <linux/tracepoint.h> -#include <linux/trace_seq.h> - -#include "hfi.h" -#include "mad.h" -#include "sdma.h" - -#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) -#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) - -#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype } -#define show_packettype(etype) \ -__print_symbolic(etype, \ - packettype_name(EXPECTED), \ - packettype_name(EAGER), \ - packettype_name(IB), \ - packettype_name(ERROR), \ - packettype_name(BYPASS)) - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_rx - -TRACE_EVENT(hfi1_rcvhdr, - TP_PROTO(struct hfi1_devdata *dd, - u32 ctxt, - u64 eflags, - u32 etype, - u32 hlen, - u32 tlen, - u32 updegr, - u32 etail - ), - TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __field(u64, eflags) - __field(u32, ctxt) - __field(u32, etype) - __field(u32, hlen) - __field(u32, tlen) - __field(u32, updegr) - __field(u32, etail) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd); - __entry->eflags = eflags; - __entry->ctxt = ctxt; - __entry->etype = etype; - __entry->hlen = hlen; - __entry->tlen = tlen; - __entry->updegr = updegr; - __entry->etail = etail; - ), - TP_printk( - "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d", - __get_str(dev), - __entry->ctxt, - __entry->eflags, - __entry->etype, show_packettype(__entry->etype), - __entry->hlen, - __entry->tlen, - __entry->updegr, - __entry->etail - ) -); - -TRACE_EVENT(hfi1_receive_interrupt, - TP_PROTO(struct hfi1_devdata *dd, u32 ctxt), - 
TP_ARGS(dd, ctxt), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __field(u32, ctxt) - __field(u8, slow_path) - __field(u8, dma_rtail) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - if (dd->rcd[ctxt]->do_interrupt == - &handle_receive_interrupt) { - __entry->slow_path = 1; - __entry->dma_rtail = 0xFF; - } else if (dd->rcd[ctxt]->do_interrupt == - &handle_receive_interrupt_dma_rtail){ - __entry->dma_rtail = 1; - __entry->slow_path = 0; - } else if (dd->rcd[ctxt]->do_interrupt == - &handle_receive_interrupt_nodma_rtail) { - __entry->dma_rtail = 0; - __entry->slow_path = 0; - } - ), - TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d", - __get_str(dev), - __entry->ctxt, - __entry->slow_path, - __entry->dma_rtail - ) -); - -TRACE_EVENT(hfi1_exp_tid_reg, - TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, - u32 npages, unsigned long va, unsigned long pa, - dma_addr_t dma), - TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), - TP_STRUCT__entry( - __field(unsigned, ctxt) - __field(u16, subctxt) - __field(u32, rarr) - __field(u32, npages) - __field(unsigned long, va) - __field(unsigned long, pa) - __field(dma_addr_t, dma) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->rarr = rarr; - __entry->npages = npages; - __entry->va = va; - __entry->pa = pa; - __entry->dma = dma; - ), - TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", - __entry->ctxt, - __entry->subctxt, - __entry->rarr, - __entry->npages, - __entry->pa, - __entry->va, - __entry->dma - ) - ); - -TRACE_EVENT(hfi1_exp_tid_unreg, - TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, u32 npages, - unsigned long va, unsigned long pa, dma_addr_t dma), - TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), - TP_STRUCT__entry( - __field(unsigned, ctxt) - __field(u16, subctxt) - __field(u32, rarr) - __field(u32, npages) - __field(unsigned long, va) - __field(unsigned long, pa) - __field(dma_addr_t, dma) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - 
__entry->subctxt = subctxt; - __entry->rarr = rarr; - __entry->npages = npages; - __entry->va = va; - __entry->pa = pa; - __entry->dma = dma; - ), - TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", - __entry->ctxt, - __entry->subctxt, - __entry->rarr, - __entry->npages, - __entry->pa, - __entry->va, - __entry->dma - ) - ); - -TRACE_EVENT(hfi1_exp_tid_inval, - TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr, - u32 npages, dma_addr_t dma), - TP_ARGS(ctxt, subctxt, va, rarr, npages, dma), - TP_STRUCT__entry( - __field(unsigned, ctxt) - __field(u16, subctxt) - __field(unsigned long, va) - __field(u32, rarr) - __field(u32, npages) - __field(dma_addr_t, dma) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->va = va; - __entry->rarr = rarr; - __entry->npages = npages; - __entry->dma = dma; - ), - TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx", - __entry->ctxt, - __entry->subctxt, - __entry->rarr, - __entry->npages, - __entry->va, - __entry->dma - ) - ); - -TRACE_EVENT(hfi1_mmu_invalidate, - TP_PROTO(unsigned ctxt, u16 subctxt, const char *type, - unsigned long start, unsigned long end), - TP_ARGS(ctxt, subctxt, type, start, end), - TP_STRUCT__entry( - __field(unsigned, ctxt) - __field(u16, subctxt) - __string(type, type) - __field(unsigned long, start) - __field(unsigned long, end) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __assign_str(type, type); - __entry->start = start; - __entry->end = end; - ), - TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx", - __entry->ctxt, - __entry->subctxt, - __get_str(type), - __entry->start, - __entry->end - ) - ); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_tx - -TRACE_EVENT(hfi1_piofree, - TP_PROTO(struct send_context *sc, int extra), - TP_ARGS(sc, extra), - TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) - __field(u32, sw_index) - __field(u32, hw_context) - __field(int, extra) - ), - 
TP_fast_assign(DD_DEV_ASSIGN(sc->dd); - __entry->sw_index = sc->sw_index; - __entry->hw_context = sc->hw_context; - __entry->extra = extra; - ), - TP_printk("[%s] ctxt %u(%u) extra %d", - __get_str(dev), - __entry->sw_index, - __entry->hw_context, - __entry->extra - ) -); - -TRACE_EVENT(hfi1_wantpiointr, - TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl), - TP_ARGS(sc, needint, credit_ctrl), - TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) - __field(u32, sw_index) - __field(u32, hw_context) - __field(u32, needint) - __field(u64, credit_ctrl) - ), - TP_fast_assign(DD_DEV_ASSIGN(sc->dd); - __entry->sw_index = sc->sw_index; - __entry->hw_context = sc->hw_context; - __entry->needint = needint; - __entry->credit_ctrl = credit_ctrl; - ), - TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx", - __get_str(dev), - __entry->sw_index, - __entry->hw_context, - __entry->needint, - (unsigned long long)__entry->credit_ctrl - ) -); - -DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, - TP_PROTO(struct rvt_qp *qp, u32 flags), - TP_ARGS(qp, flags), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) - __field(u32, qpn) - __field(u32, flags) - __field(u32, s_flags) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) - __entry->flags = flags; - __entry->qpn = qp->ibqp.qp_num; - __entry->s_flags = qp->s_flags; - ), - TP_printk( - "[%s] qpn 0x%x flags 0x%x s_flags 0x%x", - __get_str(dev), - __entry->qpn, - __entry->flags, - __entry->s_flags - ) -); - -DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup, - TP_PROTO(struct rvt_qp *qp, u32 flags), - TP_ARGS(qp, flags)); - -DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep, - TP_PROTO(struct rvt_qp *qp, u32 flags), - TP_ARGS(qp, flags)); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_ibhdrs - -u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr); -const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs); - -#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs) - 
-const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1); - -#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1) - -#define lrh_name(lrh) { HFI1_##lrh, #lrh } -#define show_lnh(lrh) \ -__print_symbolic(lrh, \ - lrh_name(LRH_BTH), \ - lrh_name(LRH_GRH)) - -#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode } -#define show_ib_opcode(opcode) \ -__print_symbolic(opcode, \ - ib_opcode_name(RC_SEND_FIRST), \ - ib_opcode_name(RC_SEND_MIDDLE), \ - ib_opcode_name(RC_SEND_LAST), \ - ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(RC_SEND_ONLY), \ - ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(RC_RDMA_WRITE_FIRST), \ - ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \ - ib_opcode_name(RC_RDMA_WRITE_LAST), \ - ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(RC_RDMA_WRITE_ONLY), \ - ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(RC_RDMA_READ_REQUEST), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \ - ib_opcode_name(RC_ACKNOWLEDGE), \ - ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ - ib_opcode_name(RC_COMPARE_SWAP), \ - ib_opcode_name(RC_FETCH_ADD), \ - ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE), \ - ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE), \ - ib_opcode_name(UC_SEND_FIRST), \ - ib_opcode_name(UC_SEND_MIDDLE), \ - ib_opcode_name(UC_SEND_LAST), \ - ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(UC_SEND_ONLY), \ - ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(UC_RDMA_WRITE_FIRST), \ - ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \ - ib_opcode_name(UC_RDMA_WRITE_LAST), \ - ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(UC_RDMA_WRITE_ONLY), \ - ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(UD_SEND_ONLY), \ - 
ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(CNP)) - -#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x" -#define BTH_PRN \ - "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \ - "f %d b %d qpn 0x%.6x a %d psn 0x%.8x" -#define EHDR_PRN "%s" - -DECLARE_EVENT_CLASS(hfi1_ibhdr_template, - TP_PROTO(struct hfi1_devdata *dd, - struct hfi1_ib_header *hdr), - TP_ARGS(dd, hdr), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - /* LRH */ - __field(u8, vl) - __field(u8, lver) - __field(u8, sl) - __field(u8, lnh) - __field(u16, dlid) - __field(u16, len) - __field(u16, slid) - /* BTH */ - __field(u8, opcode) - __field(u8, se) - __field(u8, m) - __field(u8, pad) - __field(u8, tver) - __field(u16, pkey) - __field(u8, f) - __field(u8, b) - __field(u32, qpn) - __field(u8, a) - __field(u32, psn) - /* extended headers */ - __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr)) - ), - TP_fast_assign( - struct hfi1_other_headers *ohdr; - - DD_DEV_ASSIGN(dd); - /* LRH */ - __entry->vl = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 12); - __entry->lver = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf; - __entry->sl = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; - __entry->lnh = - (u8)(be16_to_cpu(hdr->lrh[0]) & 3); - __entry->dlid = - be16_to_cpu(hdr->lrh[1]); - /* allow for larger len */ - __entry->len = - be16_to_cpu(hdr->lrh[2]); - __entry->slid = - be16_to_cpu(hdr->lrh[3]); - /* BTH */ - if (__entry->lnh == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; - else - ohdr = &hdr->u.l.oth; - __entry->opcode = - (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; - __entry->se = - (be32_to_cpu(ohdr->bth[0]) >> 23) & 1; - __entry->m = - (be32_to_cpu(ohdr->bth[0]) >> 22) & 1; - __entry->pad = - (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; - __entry->tver = - (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf; - __entry->pkey = - be32_to_cpu(ohdr->bth[0]) & 0xffff; - __entry->f = - (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) & - HFI1_FECN_MASK; - __entry->b = - (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) 
& - HFI1_BECN_MASK; - __entry->qpn = - be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; - __entry->a = - (be32_to_cpu(ohdr->bth[2]) >> 31) & 1; - /* allow for larger PSN */ - __entry->psn = - be32_to_cpu(ohdr->bth[2]) & 0x7fffffff; - /* extended headers */ - memcpy(__get_dynamic_array(ehdrs), &ohdr->u, - ibhdr_exhdr_len(hdr)); - ), - TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN, - __get_str(dev), - /* LRH */ - __entry->vl, - __entry->lver, - __entry->sl, - __entry->lnh, show_lnh(__entry->lnh), - __entry->dlid, - __entry->len, - __entry->slid, - /* BTH */ - __entry->opcode, show_ib_opcode(__entry->opcode), - __entry->se, - __entry->m, - __entry->pad, - __entry->tver, - __entry->pkey, - __entry->f, - __entry->b, - __entry->qpn, - __entry->a, - __entry->psn, - /* extended headers */ - __parse_ib_ehdrs( - __entry->opcode, - (void *)__get_dynamic_array(ehdrs)) - ) -); - -DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), - TP_ARGS(dd, hdr)); - -DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), - TP_ARGS(dd, hdr)); - -DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), - TP_ARGS(dd, hdr)); - -DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), - TP_ARGS(dd, hdr)); - -#define SNOOP_PRN \ - "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \ - "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]" - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_snoop - -TRACE_EVENT(snoop_capture, - TP_PROTO(struct hfi1_devdata *dd, - int hdr_len, - struct hfi1_ib_header *hdr, - int data_len, - void *data), - TP_ARGS(dd, hdr_len, hdr, data_len, data), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u16, slid) - __field(u16, dlid) - __field(u32, qpn) - __field(u8, opcode) - __field(u8, sl) - __field(u16, pkey) - __field(u32, 
hdr_len) - __field(u32, data_len) - __field(u8, lnh) - __dynamic_array(u8, raw_hdr, hdr_len) - __dynamic_array(u8, raw_pkt, data_len) - ), - TP_fast_assign( - struct hfi1_other_headers *ohdr; - - __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3); - if (__entry->lnh == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; - else - ohdr = &hdr->u.l.oth; - DD_DEV_ASSIGN(dd); - __entry->slid = be16_to_cpu(hdr->lrh[3]); - __entry->dlid = be16_to_cpu(hdr->lrh[1]); - __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; - __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; - __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; - __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff; - __entry->hdr_len = hdr_len; - __entry->data_len = data_len; - memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len); - memcpy(__get_dynamic_array(raw_pkt), data, data_len); - ), - TP_printk( - "[%s] " SNOOP_PRN, - __get_str(dev), - __entry->slid, - __entry->dlid, - __entry->qpn, - __entry->opcode, - show_ib_opcode(__entry->opcode), - __entry->sl, - __entry->pkey, - __entry->hdr_len, - __entry->data_len - ) -); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_ctxts - -#define UCTXT_FMT \ - "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, " \ - "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx" -TRACE_EVENT(hfi1_uctxtdata, - TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt), - TP_ARGS(dd, uctxt), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __field(unsigned, ctxt) - __field(u32, credits) - __field(u64, hw_free) - __field(u64, piobase) - __field(u16, rcvhdrq_cnt) - __field(u64, rcvhdrq_phys) - __field(u32, eager_cnt) - __field(u64, rcvegr_phys) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd); - __entry->ctxt = uctxt->ctxt; - __entry->credits = uctxt->sc->credits; - __entry->hw_free = (u64)uctxt->sc->hw_free; - __entry->piobase = (u64)uctxt->sc->base_addr; - __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt; - __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys; - __entry->eager_cnt = uctxt->egrbufs.alloced; - 
__entry->rcvegr_phys = - uctxt->egrbufs.rcvtids[0].phys; - ), - TP_printk("[%s] ctxt %u " UCTXT_FMT, - __get_str(dev), - __entry->ctxt, - __entry->credits, - __entry->hw_free, - __entry->piobase, - __entry->rcvhdrq_cnt, - __entry->rcvhdrq_phys, - __entry->eager_cnt, - __entry->rcvegr_phys - ) -); - -#define CINFO_FMT \ - "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u" -TRACE_EVENT(hfi1_ctxt_info, - TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt, - struct hfi1_ctxt_info cinfo), - TP_ARGS(dd, ctxt, subctxt, cinfo), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __field(unsigned, ctxt) - __field(unsigned, subctxt) - __field(u16, egrtids) - __field(u16, rcvhdrq_cnt) - __field(u16, rcvhdrq_size) - __field(u16, sdma_ring_size) - __field(u32, rcvegr_size) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->egrtids = cinfo.egrtids; - __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt; - __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize; - __entry->sdma_ring_size = cinfo.sdma_ring_size; - __entry->rcvegr_size = cinfo.rcvegr_size; - ), - TP_printk("[%s] ctxt %u:%u " CINFO_FMT, - __get_str(dev), - __entry->ctxt, - __entry->subctxt, - __entry->egrtids, - __entry->rcvegr_size, - __entry->rcvhdrq_cnt, - __entry->rcvhdrq_size, - __entry->sdma_ring_size - ) -); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_sma - -#define BCT_FORMAT \ - "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]" - -#define BCT(field) \ - be16_to_cpu( \ - ((struct buffer_control *)__get_dynamic_array(bct))->field \ - ) - -DECLARE_EVENT_CLASS(hfi1_bct_template, - TP_PROTO(struct hfi1_devdata *dd, - struct buffer_control *bc), - TP_ARGS(dd, bc), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __dynamic_array(u8, bct, sizeof(*bc)) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd); - memcpy(__get_dynamic_array(bct), bc, - sizeof(*bc)); - ), - TP_printk(BCT_FORMAT, - BCT(overall_shared_limit), - - 
BCT(vl[0].dedicated), - BCT(vl[0].shared), - - BCT(vl[1].dedicated), - BCT(vl[1].shared), - - BCT(vl[2].dedicated), - BCT(vl[2].shared), - - BCT(vl[3].dedicated), - BCT(vl[3].shared), - - BCT(vl[4].dedicated), - BCT(vl[4].shared), - - BCT(vl[5].dedicated), - BCT(vl[5].shared), - - BCT(vl[6].dedicated), - BCT(vl[6].shared), - - BCT(vl[7].dedicated), - BCT(vl[7].shared), - - BCT(vl[15].dedicated), - BCT(vl[15].shared) - ) -); - -DEFINE_EVENT(hfi1_bct_template, bct_set, - TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), - TP_ARGS(dd, bc)); - -DEFINE_EVENT(hfi1_bct_template, bct_get, - TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), - TP_ARGS(dd, bc)); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_sdma - -TRACE_EVENT(hfi1_sdma_descriptor, - TP_PROTO(struct sdma_engine *sde, - u64 desc0, - u64 desc1, - u16 e, - void *descp), - TP_ARGS(sde, desc0, desc1, e, descp), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(void *, descp) - __field(u64, desc0) - __field(u64, desc1) - __field(u16, e) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->desc0 = desc0; - __entry->desc1 = desc1; - __entry->idx = sde->this_idx; - __entry->descp = descp; - __entry->e = e; - ), - TP_printk( - "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u", - __get_str(dev), - __entry->idx, - __parse_sdma_flags(__entry->desc0, __entry->desc1), - (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) & - SDMA_DESC0_PHY_ADDR_MASK, - (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) & - SDMA_DESC1_GENERATION_MASK), - (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) & - SDMA_DESC0_BYTE_COUNT_MASK), - __entry->desc0, - __entry->desc1, - __entry->descp, - __entry->e - ) -); - -TRACE_EVENT(hfi1_sdma_engine_select, - TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx), - TP_ARGS(dd, sel, vl, idx), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __field(u32, sel) - __field(u8, vl) - __field(u8, idx) - ), - 
TP_fast_assign(DD_DEV_ASSIGN(dd); - __entry->sel = sel; - __entry->vl = vl; - __entry->idx = idx; - ), - TP_printk("[%s] selecting SDE %u sel 0x%x vl %u", - __get_str(dev), - __entry->idx, - __entry->sel, - __entry->vl - ) -); - -DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, - TP_PROTO(struct sdma_engine *sde, u64 status), - TP_ARGS(sde, status), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(u64, status) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->status = status; - __entry->idx = sde->this_idx; - ), - TP_printk("[%s] SDE(%u) status %llx", - __get_str(dev), - __entry->idx, - (unsigned long long)__entry->status - ) -); - -DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt, - TP_PROTO(struct sdma_engine *sde, u64 status), - TP_ARGS(sde, status) -); - -DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress, - TP_PROTO(struct sdma_engine *sde, u64 status), - TP_ARGS(sde, status) -); - -DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad, - TP_PROTO(struct sdma_engine *sde, int aidx), - TP_ARGS(sde, aidx), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(int, aidx) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->idx = sde->this_idx; - __entry->aidx = aidx; - ), - TP_printk("[%s] SDE(%u) aidx %d", - __get_str(dev), - __entry->idx, - __entry->aidx - ) -); - -DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate, - TP_PROTO(struct sdma_engine *sde, int aidx), - TP_ARGS(sde, aidx)); - -DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate, - TP_PROTO(struct sdma_engine *sde, int aidx), - TP_ARGS(sde, aidx)); - -#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER -TRACE_EVENT(hfi1_sdma_progress, - TP_PROTO(struct sdma_engine *sde, - u16 hwhead, - u16 swhead, - struct sdma_txreq *txp - ), - TP_ARGS(sde, hwhead, swhead, txp), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(u64, sn) - __field(u16, hwhead) - __field(u16, swhead) - __field(u16, txnext) - __field(u16, tx_tail) - __field(u16, tx_head) - __field(u8, idx) - ), - 
TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->hwhead = hwhead; - __entry->swhead = swhead; - __entry->tx_tail = sde->tx_tail; - __entry->tx_head = sde->tx_head; - __entry->txnext = txp ? txp->next_descq_idx : ~0; - __entry->idx = sde->this_idx; - __entry->sn = txp ? txp->sn : ~0; - ), - TP_printk( - "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", - __get_str(dev), - __entry->idx, - __entry->sn, - __entry->hwhead, - __entry->swhead, - __entry->txnext, - __entry->tx_head, - __entry->tx_tail - ) -); -#else -TRACE_EVENT(hfi1_sdma_progress, - TP_PROTO(struct sdma_engine *sde, - u16 hwhead, u16 swhead, - struct sdma_txreq *txp - ), - TP_ARGS(sde, hwhead, swhead, txp), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(u16, hwhead) - __field(u16, swhead) - __field(u16, txnext) - __field(u16, tx_tail) - __field(u16, tx_head) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->hwhead = hwhead; - __entry->swhead = swhead; - __entry->tx_tail = sde->tx_tail; - __entry->tx_head = sde->tx_head; - __entry->txnext = txp ? 
txp->next_descq_idx : ~0; - __entry->idx = sde->this_idx; - ), - TP_printk( - "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", - __get_str(dev), - __entry->idx, - __entry->hwhead, - __entry->swhead, - __entry->txnext, - __entry->tx_head, - __entry->tx_tail - ) -); -#endif - -DECLARE_EVENT_CLASS(hfi1_sdma_sn, - TP_PROTO(struct sdma_engine *sde, u64 sn), - TP_ARGS(sde, sn), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(u64, sn) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->sn = sn; - __entry->idx = sde->this_idx; - ), - TP_printk("[%s] SDE(%u) sn %llu", - __get_str(dev), - __entry->idx, - __entry->sn - ) -); - -DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, - TP_PROTO( - struct sdma_engine *sde, - u64 sn - ), - TP_ARGS(sde, sn) -); - -DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn, - TP_PROTO(struct sdma_engine *sde, u64 sn), - TP_ARGS(sde, sn) -); - -#define USDMA_HDR_FORMAT \ - "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x" - -TRACE_EVENT(hfi1_sdma_user_header, - TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, - struct hfi1_pkt_header *hdr, u32 tidval), - TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u16, ctxt) - __field(u8, subctxt) - __field(u16, req) - __field(__le32, pbc0) - __field(__le32, pbc1) - __field(__be32, lrh0) - __field(__be32, lrh1) - __field(__be32, bth0) - __field(__be32, bth1) - __field(__be32, bth2) - __field(__le32, kdeth0) - __field(__le32, kdeth1) - __field(__le32, kdeth2) - __field(__le32, kdeth3) - __field(__le32, kdeth4) - __field(__le32, kdeth5) - __field(__le32, kdeth6) - __field(__le32, kdeth7) - __field(__le32, kdeth8) - __field(u32, tidval) - ), - TP_fast_assign( - __le32 *pbc = (__le32 *)hdr->pbc; - __be32 *lrh = (__be32 *)hdr->lrh; - __be32 *bth = (__be32 *)hdr->bth; - __le32 *kdeth = (__le32 *)&hdr->kdeth; - - 
DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->req = req; - __entry->pbc0 = pbc[0]; - __entry->pbc1 = pbc[1]; - __entry->lrh0 = be32_to_cpu(lrh[0]); - __entry->lrh1 = be32_to_cpu(lrh[1]); - __entry->bth0 = be32_to_cpu(bth[0]); - __entry->bth1 = be32_to_cpu(bth[1]); - __entry->bth2 = be32_to_cpu(bth[2]); - __entry->kdeth0 = kdeth[0]; - __entry->kdeth1 = kdeth[1]; - __entry->kdeth2 = kdeth[2]; - __entry->kdeth3 = kdeth[3]; - __entry->kdeth4 = kdeth[4]; - __entry->kdeth5 = kdeth[5]; - __entry->kdeth6 = kdeth[6]; - __entry->kdeth7 = kdeth[7]; - __entry->kdeth8 = kdeth[8]; - __entry->tidval = tidval; - ), - TP_printk(USDMA_HDR_FORMAT, - __get_str(dev), - __entry->ctxt, - __entry->subctxt, - __entry->req, - __entry->pbc1, - __entry->pbc0, - __entry->lrh0, - __entry->lrh1, - __entry->bth0, - __entry->bth1, - __entry->bth2, - __entry->kdeth0, - __entry->kdeth1, - __entry->kdeth2, - __entry->kdeth3, - __entry->kdeth4, - __entry->kdeth5, - __entry->kdeth6, - __entry->kdeth7, - __entry->kdeth8, - __entry->tidval - ) - ); - -#define SDMA_UREQ_FMT \ - "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u" -TRACE_EVENT(hfi1_sdma_user_reqinfo, - TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i), - TP_ARGS(dd, ctxt, subctxt, i), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd); - __field(u16, ctxt) - __field(u8, subctxt) - __field(u8, ver_opcode) - __field(u8, iovcnt) - __field(u16, npkts) - __field(u16, fragsize) - __field(u16, comp_idx) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->ver_opcode = i[0] & 0xff; - __entry->iovcnt = (i[0] >> 8) & 0xff; - __entry->npkts = i[1]; - __entry->fragsize = i[2]; - __entry->comp_idx = i[3]; - ), - TP_printk(SDMA_UREQ_FMT, - __get_str(dev), - __entry->ctxt, - __entry->subctxt, - __entry->ver_opcode, - __entry->iovcnt, - __entry->npkts, - __entry->fragsize, - __entry->comp_idx - ) - ); - -#define usdma_complete_name(st) { st, #st } 
-#define show_usdma_complete_state(st) \ - __print_symbolic(st, \ - usdma_complete_name(FREE), \ - usdma_complete_name(QUEUED), \ - usdma_complete_name(COMPLETE), \ - usdma_complete_name(ERROR)) - -TRACE_EVENT(hfi1_sdma_user_completion, - TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx, - u8 state, int code), - TP_ARGS(dd, ctxt, subctxt, idx, state, code), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u16, ctxt) - __field(u8, subctxt) - __field(u16, idx) - __field(u8, state) - __field(int, code) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->idx = idx; - __entry->state = state; - __entry->code = code; - ), - TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)", - __get_str(dev), __entry->ctxt, __entry->subctxt, - __entry->idx, show_usdma_complete_state(__entry->state), - __entry->code) - ); - -const char *print_u32_array(struct trace_seq *, u32 *, int); -#define __print_u32_hex(arr, len) print_u32_array(p, arr, len) - -TRACE_EVENT(hfi1_sdma_user_header_ahg, - TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, - u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval), - TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u16, ctxt) - __field(u8, subctxt) - __field(u16, req) - __field(u8, sde) - __field(u8, idx) - __field(int, len) - __field(u32, tidval) - __array(u32, ahg, 10) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->req = req; - __entry->sde = sde; - __entry->idx = ahgidx; - __entry->len = len; - __entry->tidval = tidval; - memcpy(__entry->ahg, ahg, len * sizeof(u32)); - ), - TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x", - __get_str(dev), - __entry->ctxt, - __entry->subctxt, - __entry->req, - __entry->sde, - __entry->idx, - __entry->len - 1, - __print_u32_hex(__entry->ahg, __entry->len), - __entry->tidval - ) - ); - 
-TRACE_EVENT(hfi1_sdma_state, - TP_PROTO(struct sdma_engine *sde, - const char *cstate, - const char *nstate - ), - TP_ARGS(sde, cstate, nstate), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __string(curstate, cstate) - __string(newstate, nstate) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __assign_str(curstate, cstate); - __assign_str(newstate, nstate); - ), - TP_printk("[%s] current state %s new state %s", - __get_str(dev), - __get_str(curstate), - __get_str(newstate) - ) -); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_rc - -DECLARE_EVENT_CLASS(hfi1_rc_template, - TP_PROTO(struct rvt_qp *qp, u32 psn), - TP_ARGS(qp, psn), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) - __field(u32, qpn) - __field(u32, s_flags) - __field(u32, psn) - __field(u32, s_psn) - __field(u32, s_next_psn) - __field(u32, s_sending_psn) - __field(u32, s_sending_hpsn) - __field(u32, r_psn) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) - __entry->qpn = qp->ibqp.qp_num; - __entry->s_flags = qp->s_flags; - __entry->psn = psn; - __entry->s_psn = qp->s_psn; - __entry->s_next_psn = qp->s_next_psn; - __entry->s_sending_psn = qp->s_sending_psn; - __entry->s_sending_hpsn = qp->s_sending_hpsn; - __entry->r_psn = qp->r_psn; - ), - TP_printk( - "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x", - __get_str(dev), - __entry->qpn, - __entry->s_flags, - __entry->psn, - __entry->s_psn, - __entry->s_next_psn, - __entry->s_sending_psn, - __entry->s_sending_hpsn, - __entry->r_psn - ) -); - -DEFINE_EVENT(hfi1_rc_template, hfi1_rc_sendcomplete, - TP_PROTO(struct rvt_qp *qp, u32 psn), - TP_ARGS(qp, psn) -); - -DEFINE_EVENT(hfi1_rc_template, hfi1_rc_ack, - TP_PROTO(struct rvt_qp *qp, u32 psn), - TP_ARGS(qp, psn) -); - -DEFINE_EVENT(hfi1_rc_template, hfi1_rc_timeout, - TP_PROTO(struct rvt_qp *qp, u32 psn), - TP_ARGS(qp, psn) -); - -DEFINE_EVENT(hfi1_rc_template, hfi1_rc_rcv_error, - TP_PROTO(struct 
rvt_qp *qp, u32 psn), - TP_ARGS(qp, psn) -); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_misc - -TRACE_EVENT(hfi1_interrupt, - TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry, - int src), - TP_ARGS(dd, is_entry, src), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __array(char, buf, 64) - __field(int, src) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd) - is_entry->is_name(__entry->buf, 64, - src - is_entry->start); - __entry->src = src; - ), - TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf, - __entry->src) -); - -/* - * Note: - * This produces a REALLY ugly trace in the console output when the string is - * too long. - */ - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_trace - -#define MAX_MSG_LEN 512 - -DECLARE_EVENT_CLASS(hfi1_trace_template, - TP_PROTO(const char *function, struct va_format *vaf), - TP_ARGS(function, vaf), - TP_STRUCT__entry(__string(function, function) - __dynamic_array(char, msg, MAX_MSG_LEN) - ), - TP_fast_assign(__assign_str(function, function); - WARN_ON_ONCE(vsnprintf - (__get_dynamic_array(msg), - MAX_MSG_LEN, vaf->fmt, - *vaf->va) >= - MAX_MSG_LEN); - ), - TP_printk("(%s) %s", - __get_str(function), - __get_str(msg)) -); - -/* - * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an - * actual function to work and can not be in a macro. - */ -#define __hfi1_trace_def(lvl) \ -void __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \ - \ -DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \ - TP_PROTO(const char *function, struct va_format *vaf), \ - TP_ARGS(function, vaf)) - -#define __hfi1_trace_fn(lvl) \ -void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \ -{ \ - struct va_format vaf = { \ - .fmt = fmt, \ - }; \ - va_list args; \ - \ - va_start(args, fmt); \ - vaf.va = &args; \ - trace_hfi1_ ##lvl(func, &vaf); \ - va_end(args); \ - return; \ -} - -/* - * To create a new trace level simply define it below and as a __hfi1_trace_fn - * in trace.c. 
This will create all the hooks for calling - * hfi1_cdbg(LVL, fmt, ...); as well as take care of all - * the debugfs stuff. - */ -__hfi1_trace_def(PKT); -__hfi1_trace_def(PROC); -__hfi1_trace_def(SDMA); -__hfi1_trace_def(LINKVERB); -__hfi1_trace_def(DEBUG); -__hfi1_trace_def(SNOOP); -__hfi1_trace_def(CNTR); -__hfi1_trace_def(PIO); -__hfi1_trace_def(DC8051); -__hfi1_trace_def(FIRMWARE); -__hfi1_trace_def(RCVCTRL); -__hfi1_trace_def(TID); -__hfi1_trace_def(MMU); -__hfi1_trace_def(IOCTL); - -#define hfi1_cdbg(which, fmt, ...) \ - __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__) - -#define hfi1_dbg(fmt, ...) \ - hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__) - -/* - * Define HFI1_EARLY_DBG at compile time or here to enable early trace - * messages. Do not check in an enablement for this. - */ - -#ifdef HFI1_EARLY_DBG -#define hfi1_dbg_early(fmt, ...) \ - trace_printk(fmt, ##__VA_ARGS__) -#else -#define hfi1_dbg_early(fmt, ...) -#endif - -#endif /* __HFI1_TRACE_H */ - -#undef TRACE_INCLUDE_PATH -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE trace -#include <trace/define_trace.h> +#include "trace_dbg.h" +#include "trace_misc.h" +#include "trace_ctxts.h" +#include "trace_ibhdrs.h" +#include "trace_rc.h" +#include "trace_rx.h" +#include "trace_tx.h" diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h new file mode 100644 index 000000000000..31654bbac1cf --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h @@ -0,0 +1,141 @@ +/* +* Copyright(c) 2015, 2016 Intel Corporation. +* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+* +*/ +#if !defined(__HFI1_TRACE_CTXTS_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_CTXTS_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ctxts + +#define UCTXT_FMT \ + "cred:%u, credaddr:0x%llx, piobase:0x%p, rcvhdr_cnt:%u, " \ + "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx" +TRACE_EVENT(hfi1_uctxtdata, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt), + TP_ARGS(dd, uctxt), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned int, ctxt) + __field(u32, credits) + __field(u64, hw_free) + __field(void __iomem *, piobase) + __field(u16, rcvhdrq_cnt) + __field(u64, rcvhdrq_phys) + __field(u32, eager_cnt) + __field(u64, rcvegr_phys) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = uctxt->ctxt; + __entry->credits = uctxt->sc->credits; + __entry->hw_free = le64_to_cpu(*uctxt->sc->hw_free); + __entry->piobase = uctxt->sc->base_addr; + __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt; + __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys; + __entry->eager_cnt = uctxt->egrbufs.alloced; + __entry->rcvegr_phys = + uctxt->egrbufs.rcvtids[0].phys; + ), + TP_printk("[%s] ctxt %u " UCTXT_FMT, + __get_str(dev), + __entry->ctxt, + __entry->credits, + __entry->hw_free, + __entry->piobase, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_phys, + __entry->eager_cnt, + __entry->rcvegr_phys + ) +); + +#define CINFO_FMT \ + "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u" +TRACE_EVENT(hfi1_ctxt_info, + TP_PROTO(struct hfi1_devdata *dd, unsigned int ctxt, + unsigned int subctxt, + struct hfi1_ctxt_info cinfo), + TP_ARGS(dd, ctxt, subctxt, cinfo), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned int, ctxt) + __field(unsigned int, subctxt) + __field(u16, egrtids) + __field(u16, rcvhdrq_cnt) + __field(u16, rcvhdrq_size) + __field(u16, sdma_ring_size) + __field(u32, rcvegr_size) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + 
__entry->subctxt = subctxt; + __entry->egrtids = cinfo.egrtids; + __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt; + __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize; + __entry->sdma_ring_size = cinfo.sdma_ring_size; + __entry->rcvegr_size = cinfo.rcvegr_size; + ), + TP_printk("[%s] ctxt %u:%u " CINFO_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->egrtids, + __entry->rcvegr_size, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_size, + __entry->sdma_ring_size + ) +); + +#endif /* __HFI1_TRACE_CTXTS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_ctxts +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h new file mode 100644 index 000000000000..0e7d929530c5 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h @@ -0,0 +1,155 @@ +/* +* Copyright(c) 2015, 2016 Intel Corporation. +* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. 
+* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ +#if !defined(__HFI1_TRACE_EXTRA_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_EXTRA_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +/* + * Note: + * This produces a REALLY ugly trace in the console output when the string is + * too long. 
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_dbg + +#define MAX_MSG_LEN 512 + +DECLARE_EVENT_CLASS(hfi1_trace_template, + TP_PROTO(const char *function, struct va_format *vaf), + TP_ARGS(function, vaf), + TP_STRUCT__entry(__string(function, function) + __dynamic_array(char, msg, MAX_MSG_LEN) + ), + TP_fast_assign(__assign_str(function, function); + WARN_ON_ONCE(vsnprintf + (__get_dynamic_array(msg), + MAX_MSG_LEN, vaf->fmt, + *vaf->va) >= + MAX_MSG_LEN); + ), + TP_printk("(%s) %s", + __get_str(function), + __get_str(msg)) +); + +/* + * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an + * actual function to work and can not be in a macro. + */ +#define __hfi1_trace_def(lvl) \ +void __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \ + \ +DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \ + TP_PROTO(const char *function, struct va_format *vaf), \ + TP_ARGS(function, vaf)) + +#define __hfi1_trace_fn(lvl) \ +void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \ +{ \ + struct va_format vaf = { \ + .fmt = fmt, \ + }; \ + va_list args; \ + \ + va_start(args, fmt); \ + vaf.va = &args; \ + trace_hfi1_ ##lvl(func, &vaf); \ + va_end(args); \ + return; \ +} + +/* + * To create a new trace level simply define it below and as a __hfi1_trace_fn + * in trace.c. This will create all the hooks for calling + * hfi1_cdbg(LVL, fmt, ...); as well as take care of all + * the debugfs stuff. + */ +__hfi1_trace_def(PKT); +__hfi1_trace_def(PROC); +__hfi1_trace_def(SDMA); +__hfi1_trace_def(LINKVERB); +__hfi1_trace_def(DEBUG); +__hfi1_trace_def(SNOOP); +__hfi1_trace_def(CNTR); +__hfi1_trace_def(PIO); +__hfi1_trace_def(DC8051); +__hfi1_trace_def(FIRMWARE); +__hfi1_trace_def(RCVCTRL); +__hfi1_trace_def(TID); +__hfi1_trace_def(MMU); +__hfi1_trace_def(IOCTL); + +#define hfi1_cdbg(which, fmt, ...) \ + __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__) + +#define hfi1_dbg(fmt, ...) 
\ + hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__) + +/* + * Define HFI1_EARLY_DBG at compile time or here to enable early trace + * messages. Do not check in an enablement for this. + */ + +#ifdef HFI1_EARLY_DBG +#define hfi1_dbg_early(fmt, ...) \ + trace_printk(fmt, ##__VA_ARGS__) +#else +#define hfi1_dbg_early(fmt, ...) +#endif + +#endif /* __HFI1_TRACE_EXTRA_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_dbg +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h new file mode 100644 index 000000000000..c3e41aed0034 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -0,0 +1,209 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#if !defined(__HFI1_TRACE_IBHDRS_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_IBHDRS_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ibhdrs + +u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr); +const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs); + +#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs) + +#define lrh_name(lrh) { HFI1_##lrh, #lrh } +#define show_lnh(lrh) \ +__print_symbolic(lrh, \ + lrh_name(LRH_BTH), \ + lrh_name(LRH_GRH)) + +#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x" +#define BTH_PRN \ + "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \ + "f %d b %d qpn 0x%.6x a %d psn 0x%.8x" +#define EHDR_PRN "%s" + +DECLARE_EVENT_CLASS(hfi1_ibhdr_template, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + /* LRH */ + __field(u8, vl) + __field(u8, lver) + __field(u8, sl) + __field(u8, lnh) + __field(u16, dlid) + __field(u16, len) + __field(u16, slid) + /* BTH */ + __field(u8, opcode) + __field(u8, se) + __field(u8, m) + __field(u8, pad) + __field(u8, tver) + __field(u16, pkey) + __field(u8, f) + __field(u8, b) + __field(u32, qpn) + __field(u8, a) + __field(u32, psn) + /* extended headers */ + __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr)) + ), + TP_fast_assign( + struct hfi1_other_headers *ohdr; + + DD_DEV_ASSIGN(dd); + /* LRH */ + __entry->vl = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 12); + __entry->lver = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf; + __entry->sl = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; + __entry->lnh = + (u8)(be16_to_cpu(hdr->lrh[0]) & 3); + __entry->dlid = + be16_to_cpu(hdr->lrh[1]); + /* allow for larger len */ + __entry->len = + be16_to_cpu(hdr->lrh[2]); + __entry->slid = + be16_to_cpu(hdr->lrh[3]); + /* BTH */ + if (__entry->lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + 
ohdr = &hdr->u.l.oth; + __entry->opcode = + (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + __entry->se = + (be32_to_cpu(ohdr->bth[0]) >> 23) & 1; + __entry->m = + (be32_to_cpu(ohdr->bth[0]) >> 22) & 1; + __entry->pad = + (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + __entry->tver = + (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf; + __entry->pkey = + be32_to_cpu(ohdr->bth[0]) & 0xffff; + __entry->f = + (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) & + HFI1_FECN_MASK; + __entry->b = + (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) & + HFI1_BECN_MASK; + __entry->qpn = + be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + __entry->a = + (be32_to_cpu(ohdr->bth[2]) >> 31) & 1; + /* allow for larger PSN */ + __entry->psn = + be32_to_cpu(ohdr->bth[2]) & 0x7fffffff; + /* extended headers */ + memcpy(__get_dynamic_array(ehdrs), &ohdr->u, + ibhdr_exhdr_len(hdr)); + ), + TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN, + __get_str(dev), + /* LRH */ + __entry->vl, + __entry->lver, + __entry->sl, + __entry->lnh, show_lnh(__entry->lnh), + __entry->dlid, + __entry->len, + __entry->slid, + /* BTH */ + __entry->opcode, show_ib_opcode(__entry->opcode), + __entry->se, + __entry->m, + __entry->pad, + __entry->tver, + __entry->pkey, + __entry->f, + __entry->b, + __entry->qpn, + __entry->a, + __entry->psn, + /* extended headers */ + __parse_ib_ehdrs( + __entry->opcode, + (void *)__get_dynamic_array(ehdrs)) + ) +); + +DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +#endif /* __HFI1_TRACE_IBHDRS_H */ + 
+#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_ibhdrs +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_misc.h b/drivers/infiniband/hw/hfi1/trace_misc.h new file mode 100644 index 000000000000..d308454af7fd --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_misc.h @@ -0,0 +1,81 @@ +/* +* Copyright(c) 2015, 2016 Intel Corporation. +* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. 
+* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ +#if !defined(__HFI1_TRACE_MISC_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_MISC_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_misc + +TRACE_EVENT(hfi1_interrupt, + TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry, + int src), + TP_ARGS(dd, is_entry, src), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __array(char, buf, 64) + __field(int, src) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd) + is_entry->is_name(__entry->buf, 64, + src - is_entry->start); + __entry->src = src; + ), + TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf, + __entry->src) +); + +#endif /* __HFI1_TRACE_MISC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_misc +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h new file mode 100644 index 000000000000..5ea5005f9f41 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_rc.h @@ -0,0 +1,123 @@ +/* +* Copyright(c) 2015, 2016 Intel Corporation. 
+* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ +#if !defined(__HFI1_TRACE_RC_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_RC_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_rc + +DECLARE_EVENT_CLASS(hfi1_rc_template, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, s_flags) + __field(u32, psn) + __field(u32, s_psn) + __field(u32, s_next_psn) + __field(u32, s_sending_psn) + __field(u32, s_sending_hpsn) + __field(u32, r_psn) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + __entry->psn = psn; + __entry->s_psn = qp->s_psn; + __entry->s_next_psn = qp->s_next_psn; + __entry->s_sending_psn = qp->s_sending_psn; + __entry->s_sending_hpsn = qp->s_sending_hpsn; + __entry->r_psn = qp->r_psn; + ), + TP_printk( + "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->s_flags, + __entry->psn, + __entry->s_psn, + __entry->s_next_psn, + __entry->s_sending_psn, + __entry->s_sending_hpsn, + __entry->r_psn + ) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_sendcomplete, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_ack, + TP_PROTO(struct 
rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_timeout, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +#endif /* __HFI1_TRACE_RC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_rc +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h new file mode 100644 index 000000000000..9ba1f615ec95 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_rx.h @@ -0,0 +1,322 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#if !defined(__HFI1_TRACE_RX_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_RX_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_rx + +TRACE_EVENT(hfi1_rcvhdr, + TP_PROTO(struct hfi1_devdata *dd, + u32 ctxt, + u64 eflags, + u32 etype, + u32 hlen, + u32 tlen, + u32 updegr, + u32 etail + ), + TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u64, eflags) + __field(u32, ctxt) + __field(u32, etype) + __field(u32, hlen) + __field(u32, tlen) + __field(u32, updegr) + __field(u32, etail) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->eflags = eflags; + __entry->ctxt = ctxt; + __entry->etype = etype; + __entry->hlen = hlen; + __entry->tlen = tlen; + __entry->updegr = updegr; + __entry->etail = etail; + ), + TP_printk( + "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d", + __get_str(dev), + __entry->ctxt, + __entry->eflags, + __entry->etype, show_packettype(__entry->etype), + __entry->hlen, + __entry->tlen, + __entry->updegr, + __entry->etail + ) +); + +TRACE_EVENT(hfi1_receive_interrupt, + TP_PROTO(struct hfi1_devdata *dd, u32 ctxt), + TP_ARGS(dd, ctxt), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u32, ctxt) + __field(u8, slow_path) + __field(u8, dma_rtail) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt) { + __entry->slow_path = 1; + __entry->dma_rtail = 0xFF; + } else if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt_dma_rtail){ + __entry->dma_rtail = 1; + __entry->slow_path = 0; + } else if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt_nodma_rtail) { + __entry->dma_rtail = 0; + __entry->slow_path = 0; + } + ), + TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d", + __get_str(dev), + __entry->ctxt, + __entry->slow_path, + __entry->dma_rtail + ) +); + 
+TRACE_EVENT(hfi1_exp_tid_reg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, + u32 npages, unsigned long va, unsigned long pa, + dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), + TP_STRUCT__entry( + __field(unsigned int, ctxt) + __field(u16, subctxt) + __field(u32, rarr) + __field(u32, npages) + __field(unsigned long, va) + __field(unsigned long, pa) + __field(dma_addr_t, dma) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->va = va; + __entry->pa = pa; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->pa, + __entry->va, + __entry->dma + ) + ); + +TRACE_EVENT(hfi1_exp_tid_unreg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), + TP_STRUCT__entry( + __field(unsigned int, ctxt) + __field(u16, subctxt) + __field(u32, rarr) + __field(u32, npages) + __field(unsigned long, va) + __field(unsigned long, pa) + __field(dma_addr_t, dma) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->va = va; + __entry->pa = pa; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->pa, + __entry->va, + __entry->dma + ) + ); + +TRACE_EVENT(hfi1_exp_tid_inval, + TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr, + u32 npages, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, va, rarr, npages, dma), + TP_STRUCT__entry( + __field(unsigned int, ctxt) + __field(u16, subctxt) + __field(unsigned long, va) + __field(u32, rarr) + __field(u32, npages) + __field(dma_addr_t, dma) + ), + TP_fast_assign( + 
__entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->va = va; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->va, + __entry->dma + ) + ); + +TRACE_EVENT(hfi1_mmu_invalidate, + TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type, + unsigned long start, unsigned long end), + TP_ARGS(ctxt, subctxt, type, start, end), + TP_STRUCT__entry( + __field(unsigned int, ctxt) + __field(u16, subctxt) + __string(type, type) + __field(unsigned long, start) + __field(unsigned long, end) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __assign_str(type, type); + __entry->start = start; + __entry->end = end; + ), + TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx", + __entry->ctxt, + __entry->subctxt, + __get_str(type), + __entry->start, + __entry->end + ) + ); + +#define SNOOP_PRN \ + "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \ + "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]" + +TRACE_EVENT(snoop_capture, + TP_PROTO(struct hfi1_devdata *dd, + int hdr_len, + struct hfi1_ib_header *hdr, + int data_len, + void *data), + TP_ARGS(dd, hdr_len, hdr, data_len, data), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, slid) + __field(u16, dlid) + __field(u32, qpn) + __field(u8, opcode) + __field(u8, sl) + __field(u16, pkey) + __field(u32, hdr_len) + __field(u32, data_len) + __field(u8, lnh) + __dynamic_array(u8, raw_hdr, hdr_len) + __dynamic_array(u8, raw_pkt, data_len) + ), + TP_fast_assign( + struct hfi1_other_headers *ohdr; + + __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3); + if (__entry->lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + DD_DEV_ASSIGN(dd); + __entry->slid = be16_to_cpu(hdr->lrh[3]); + __entry->dlid = be16_to_cpu(hdr->lrh[1]); + __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + 
__entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; + __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff; + __entry->hdr_len = hdr_len; + __entry->data_len = data_len; + memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len); + memcpy(__get_dynamic_array(raw_pkt), data, data_len); + ), + TP_printk( + "[%s] " SNOOP_PRN, + __get_str(dev), + __entry->slid, + __entry->dlid, + __entry->qpn, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->sl, + __entry->pkey, + __entry->hdr_len, + __entry->data_len + ) +); + +#endif /* __HFI1_TRACE_RX_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_rx +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h new file mode 100644 index 000000000000..415d6be42c5d --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -0,0 +1,642 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#if !defined(__HFI1_TRACE_TX_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_TX_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" +#include "mad.h" +#include "sdma.h" + +const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1); + +#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_tx + +TRACE_EVENT(hfi1_piofree, + TP_PROTO(struct send_context *sc, int extra), + TP_ARGS(sc, extra), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(int, extra) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->extra = extra; + ), + TP_printk("[%s] ctxt %u(%u) extra %d", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->extra + ) +); + +TRACE_EVENT(hfi1_wantpiointr, + TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl), + TP_ARGS(sc, needint, credit_ctrl), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(u32, needint) + __field(u64, credit_ctrl) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->needint = needint; + __entry->credit_ctrl = credit_ctrl; + ), + TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->needint, + (unsigned long long)__entry->credit_ctrl + ) +); + +DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, flags) + __field(u32, s_flags) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->flags = flags; + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = 
qp->s_flags; + ), + TP_printk( + "[%s] qpn 0x%x flags 0x%x s_flags 0x%x", + __get_str(dev), + __entry->qpn, + __entry->flags, + __entry->s_flags + ) +); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +TRACE_EVENT(hfi1_sdma_descriptor, + TP_PROTO(struct sdma_engine *sde, + u64 desc0, + u64 desc1, + u16 e, + void *descp), + TP_ARGS(sde, desc0, desc1, e, descp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(void *, descp) + __field(u64, desc0) + __field(u64, desc1) + __field(u16, e) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->desc0 = desc0; + __entry->desc1 = desc1; + __entry->idx = sde->this_idx; + __entry->descp = descp; + __entry->e = e; + ), + TP_printk( + "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u", + __get_str(dev), + __entry->idx, + __parse_sdma_flags(__entry->desc0, __entry->desc1), + (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) & + SDMA_DESC0_PHY_ADDR_MASK, + (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) & + SDMA_DESC1_GENERATION_MASK), + (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) & + SDMA_DESC0_BYTE_COUNT_MASK), + __entry->desc0, + __entry->desc1, + __entry->descp, + __entry->e + ) +); + +TRACE_EVENT(hfi1_sdma_engine_select, + TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx), + TP_ARGS(dd, sel, vl, idx), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u32, sel) + __field(u8, vl) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->sel = sel; + __entry->vl = vl; + __entry->idx = idx; + ), + TP_printk("[%s] selecting SDE %u sel 0x%x vl %u", + __get_str(dev), + __entry->idx, + __entry->sel, + __entry->vl + ) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status), + 
TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, status) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->status = status; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) status %llx", + __get_str(dev), + __entry->idx, + (unsigned long long)__entry->status + ) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(int, aidx) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->idx = sde->this_idx; + __entry->aidx = aidx; + ), + TP_printk("[%s] SDE(%u) aidx %d", + __get_str(dev), + __entry->idx, + __entry->aidx + ) +); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx)); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx)); + +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, + u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + __entry->sn = txp ? 
txp->sn : ~0; + ), + TP_printk( + "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->sn, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#else +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + ), + TP_printk( + "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#endif + +DECLARE_EVENT_CLASS(hfi1_sdma_sn, + TP_PROTO(struct sdma_engine *sde, u64 sn), + TP_ARGS(sde, sn), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->sn = sn; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) sn %llu", + __get_str(dev), + __entry->idx, + __entry->sn + ) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, + TP_PROTO( + struct sdma_engine *sde, + u64 sn + ), + TP_ARGS(sde, sn) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn, + TP_PROTO(struct sdma_engine *sde, u64 sn), + TP_ARGS(sde, sn) +); + +#define USDMA_HDR_FORMAT \ + "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x" + +TRACE_EVENT(hfi1_sdma_user_header, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 
req, + struct hfi1_pkt_header *hdr, u32 tidval), + TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(u32, pbc0) + __field(u32, pbc1) + __field(u32, lrh0) + __field(u32, lrh1) + __field(u32, bth0) + __field(u32, bth1) + __field(u32, bth2) + __field(u32, kdeth0) + __field(u32, kdeth1) + __field(u32, kdeth2) + __field(u32, kdeth3) + __field(u32, kdeth4) + __field(u32, kdeth5) + __field(u32, kdeth6) + __field(u32, kdeth7) + __field(u32, kdeth8) + __field(u32, tidval) + ), + TP_fast_assign( + __le32 *pbc = (__le32 *)hdr->pbc; + __be32 *lrh = (__be32 *)hdr->lrh; + __be32 *bth = (__be32 *)hdr->bth; + __le32 *kdeth = (__le32 *)&hdr->kdeth; + + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->pbc0 = le32_to_cpu(pbc[0]); + __entry->pbc1 = le32_to_cpu(pbc[1]); + __entry->lrh0 = be32_to_cpu(lrh[0]); + __entry->lrh1 = be32_to_cpu(lrh[1]); + __entry->bth0 = be32_to_cpu(bth[0]); + __entry->bth1 = be32_to_cpu(bth[1]); + __entry->bth2 = be32_to_cpu(bth[2]); + __entry->kdeth0 = le32_to_cpu(kdeth[0]); + __entry->kdeth1 = le32_to_cpu(kdeth[1]); + __entry->kdeth2 = le32_to_cpu(kdeth[2]); + __entry->kdeth3 = le32_to_cpu(kdeth[3]); + __entry->kdeth4 = le32_to_cpu(kdeth[4]); + __entry->kdeth5 = le32_to_cpu(kdeth[5]); + __entry->kdeth6 = le32_to_cpu(kdeth[6]); + __entry->kdeth7 = le32_to_cpu(kdeth[7]); + __entry->kdeth8 = le32_to_cpu(kdeth[8]); + __entry->tidval = tidval; + ), + TP_printk(USDMA_HDR_FORMAT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->pbc1, + __entry->pbc0, + __entry->lrh0, + __entry->lrh1, + __entry->bth0, + __entry->bth1, + __entry->bth2, + __entry->kdeth0, + __entry->kdeth1, + __entry->kdeth2, + __entry->kdeth3, + __entry->kdeth4, + __entry->kdeth5, + __entry->kdeth6, + __entry->kdeth7, + __entry->kdeth8, + __entry->tidval + ) +); + +#define SDMA_UREQ_FMT \ + "[%s:%u:%u] 
ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u" +TRACE_EVENT(hfi1_sdma_user_reqinfo, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i), + TP_ARGS(dd, ctxt, subctxt, i), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd); + __field(u16, ctxt) + __field(u8, subctxt) + __field(u8, ver_opcode) + __field(u8, iovcnt) + __field(u16, npkts) + __field(u16, fragsize) + __field(u16, comp_idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->ver_opcode = i[0] & 0xff; + __entry->iovcnt = (i[0] >> 8) & 0xff; + __entry->npkts = i[1]; + __entry->fragsize = i[2]; + __entry->comp_idx = i[3]; + ), + TP_printk(SDMA_UREQ_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->ver_opcode, + __entry->iovcnt, + __entry->npkts, + __entry->fragsize, + __entry->comp_idx + ) +); + +#define usdma_complete_name(st) { st, #st } +#define show_usdma_complete_state(st) \ + __print_symbolic(st, \ + usdma_complete_name(FREE), \ + usdma_complete_name(QUEUED), \ + usdma_complete_name(COMPLETE), \ + usdma_complete_name(ERROR)) + +TRACE_EVENT(hfi1_sdma_user_completion, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx, + u8 state, int code), + TP_ARGS(dd, ctxt, subctxt, idx, state, code), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, idx) + __field(u8, state) + __field(int, code) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->idx = idx; + __entry->state = state; + __entry->code = code; + ), + TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)", + __get_str(dev), __entry->ctxt, __entry->subctxt, + __entry->idx, show_usdma_complete_state(__entry->state), + __entry->code) +); + +const char *print_u32_array(struct trace_seq *, u32 *, int); +#define __print_u32_hex(arr, len) print_u32_array(p, arr, len) + +TRACE_EVENT(hfi1_sdma_user_header_ahg, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 
subctxt, u16 req, + u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval), + TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(u8, sde) + __field(u8, idx) + __field(int, len) + __field(u32, tidval) + __array(u32, ahg, 10) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->sde = sde; + __entry->idx = ahgidx; + __entry->len = len; + __entry->tidval = tidval; + memcpy(__entry->ahg, ahg, len * sizeof(u32)); + ), + TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->sde, + __entry->idx, + __entry->len - 1, + __print_u32_hex(__entry->ahg, __entry->len), + __entry->tidval + ) +); + +TRACE_EVENT(hfi1_sdma_state, + TP_PROTO(struct sdma_engine *sde, + const char *cstate, + const char *nstate + ), + TP_ARGS(sde, cstate, nstate), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __string(curstate, cstate) + __string(newstate, nstate) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __assign_str(curstate, cstate); + __assign_str(newstate, nstate); + ), + TP_printk("[%s] current state %s new state %s", + __get_str(dev), + __get_str(curstate), + __get_str(newstate) + ) +); + +#define BCT_FORMAT \ + "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]" + +#define BCT(field) \ + be16_to_cpu( \ + ((struct buffer_control *)__get_dynamic_array(bct))->field \ + ) + +DECLARE_EVENT_CLASS(hfi1_bct_template, + TP_PROTO(struct hfi1_devdata *dd, + struct buffer_control *bc), + TP_ARGS(dd, bc), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __dynamic_array(u8, bct, sizeof(*bc)) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + memcpy(__get_dynamic_array(bct), bc, + sizeof(*bc)); + ), + TP_printk(BCT_FORMAT, + BCT(overall_shared_limit), + + BCT(vl[0].dedicated), + BCT(vl[0].shared), 
+ + BCT(vl[1].dedicated), + BCT(vl[1].shared), + + BCT(vl[2].dedicated), + BCT(vl[2].shared), + + BCT(vl[3].dedicated), + BCT(vl[3].shared), + + BCT(vl[4].dedicated), + BCT(vl[4].shared), + + BCT(vl[5].dedicated), + BCT(vl[5].shared), + + BCT(vl[6].dedicated), + BCT(vl[6].shared), + + BCT(vl[7].dedicated), + BCT(vl[7].shared), + + BCT(vl[15].dedicated), + BCT(vl[15].shared) + ) +); + +DEFINE_EVENT(hfi1_bct_template, bct_set, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +DEFINE_EVENT(hfi1_bct_template, bct_get, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +#endif /* __HFI1_TRACE_TX_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_tx +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/twsi.c b/drivers/infiniband/hw/hfi1/twsi.c deleted file mode 100644 index e82e52a63d35..000000000000 --- a/drivers/infiniband/hw/hfi1/twsi.c +++ /dev/null @@ -1,489 +0,0 @@ -/* - * Copyright(c) 2015, 2016 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <linux/delay.h> -#include <linux/pci.h> -#include <linux/vmalloc.h> - -#include "hfi.h" -#include "twsi.h" - -/* - * "Two Wire Serial Interface" support. - * - * Originally written for a not-quite-i2c serial eeprom, which is - * still used on some supported boards. 
Later boards have added a - * variety of other uses, most board-specific, so the bit-boffing - * part has been split off to this file, while the other parts - * have been moved to chip-specific files. - * - * We have also dropped all pretense of fully generic (e.g. pretend - * we don't know whether '1' is the higher voltage) interface, as - * the restrictions of the generic i2c interface (e.g. no access from - * driver itself) make it unsuitable for this use. - */ - -#define READ_CMD 1 -#define WRITE_CMD 0 - -/** - * i2c_wait_for_writes - wait for a write - * @dd: the hfi1_ib device - * - * We use this instead of udelay directly, so we can make sure - * that previous register writes have been flushed all the way - * to the chip. Since we are delaying anyway, the cost doesn't - * hurt, and makes the bit twiddling more regular - */ -static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target) -{ - /* - * implicit read of EXTStatus is as good as explicit - * read of scratch, if all we want to do is flush - * writes. - */ - hfi1_gpio_mod(dd, target, 0, 0, 0); - rmb(); /* inlined, so prevent compiler reordering */ -} - -/* - * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that - * for "almost compliant" modules - */ -#define SCL_WAIT_USEC 1000 - -/* BUF_WAIT is time bus must be free between STOP or ACK and to next START. - * Should be 20, but some chips need more. - */ -#define TWSI_BUF_WAIT_USEC 60 - -static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit) -{ - u32 mask; - - udelay(1); - - mask = QSFP_HFI0_I2CCLK; - - /* SCL is meant to be bare-drain, so never set "OUT", just DIR */ - hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask); - - /* - * Allow for slow slaves by simple - * delay for falling edge, sampling on rise. 
- */ - if (!bit) { - udelay(2); - } else { - int rise_usec; - - for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) { - if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0)) - break; - udelay(2); - } - if (rise_usec <= 0) - dd_dev_err(dd, "SCL interface stuck low > %d uSec\n", - SCL_WAIT_USEC); - } - i2c_wait_for_writes(dd, target); -} - -static u8 scl_in(struct hfi1_devdata *dd, u32 target, int wait) -{ - u32 read_val, mask; - - mask = QSFP_HFI0_I2CCLK; - /* SCL is meant to be bare-drain, so never set "OUT", just DIR */ - hfi1_gpio_mod(dd, target, 0, 0, mask); - read_val = hfi1_gpio_mod(dd, target, 0, 0, 0); - if (wait) - i2c_wait_for_writes(dd, target); - return (read_val & mask) >> GPIO_SCL_NUM; -} - -static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit) -{ - u32 mask; - - mask = QSFP_HFI0_I2CDAT; - - /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ - hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask); - - i2c_wait_for_writes(dd, target); - udelay(2); -} - -static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait) -{ - u32 read_val, mask; - - mask = QSFP_HFI0_I2CDAT; - /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ - hfi1_gpio_mod(dd, target, 0, 0, mask); - read_val = hfi1_gpio_mod(dd, target, 0, 0, 0); - if (wait) - i2c_wait_for_writes(dd, target); - return (read_val & mask) >> GPIO_SDA_NUM; -} - -/** - * i2c_ackrcv - see if ack following write is true - * @dd: the hfi1_ib device - */ -static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target) -{ - u8 ack_received; - - /* AT ENTRY SCL = LOW */ - /* change direction, ignore data */ - ack_received = sda_in(dd, target, 1); - scl_out(dd, target, 1); - ack_received = sda_in(dd, target, 1) == 0; - scl_out(dd, target, 0); - return ack_received; -} - -static void stop_cmd(struct hfi1_devdata *dd, u32 target); - -/** - * rd_byte - read a byte, sending STOP on last, else ACK - * @dd: the hfi1_ib device - * - * Returns byte shifted out of device - */ -static int 
rd_byte(struct hfi1_devdata *dd, u32 target, int last) -{ - int bit_cntr, data; - - data = 0; - - for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { - data <<= 1; - scl_out(dd, target, 1); - data |= sda_in(dd, target, 0); - scl_out(dd, target, 0); - } - if (last) { - scl_out(dd, target, 1); - stop_cmd(dd, target); - } else { - sda_out(dd, target, 0); - scl_out(dd, target, 1); - scl_out(dd, target, 0); - sda_out(dd, target, 1); - } - return data; -} - -/** - * wr_byte - write a byte, one bit at a time - * @dd: the hfi1_ib device - * @data: the byte to write - * - * Returns 0 if we got the following ack, otherwise 1 - */ -static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data) -{ - int bit_cntr; - u8 bit; - - for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) { - bit = (data >> bit_cntr) & 1; - sda_out(dd, target, bit); - scl_out(dd, target, 1); - scl_out(dd, target, 0); - } - return (!i2c_ackrcv(dd, target)) ? 1 : 0; -} - -/* - * issue TWSI start sequence: - * (both clock/data high, clock high, data low while clock is high) - */ -static void start_seq(struct hfi1_devdata *dd, u32 target) -{ - sda_out(dd, target, 1); - scl_out(dd, target, 1); - sda_out(dd, target, 0); - udelay(1); - scl_out(dd, target, 0); -} - -/** - * stop_seq - transmit the stop sequence - * @dd: the hfi1_ib device - * - * (both clock/data low, clock high, data high while clock is high) - */ -static void stop_seq(struct hfi1_devdata *dd, u32 target) -{ - scl_out(dd, target, 0); - sda_out(dd, target, 0); - scl_out(dd, target, 1); - sda_out(dd, target, 1); -} - -/** - * stop_cmd - transmit the stop condition - * @dd: the hfi1_ib device - * - * (both clock/data low, clock high, data high while clock is high) - */ -static void stop_cmd(struct hfi1_devdata *dd, u32 target) -{ - stop_seq(dd, target); - udelay(TWSI_BUF_WAIT_USEC); -} - -/** - * hfi1_twsi_reset - reset I2C communication - * @dd: the hfi1_ib device - * returns 0 if ok, -EIO on error - */ -int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 
target) -{ - int clock_cycles_left = 9; - u32 mask; - - /* Both SCL and SDA should be high. If not, there - * is something wrong. - */ - mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT; - - /* - * Force pins to desired innocuous state. - * This is the default power-on state with out=0 and dir=0, - * So tri-stated and should be floating high (barring HW problems) - */ - hfi1_gpio_mod(dd, target, 0, 0, mask); - - /* Check if SCL is low, if it is low then we have a slave device - * misbehaving and there is not much we can do. - */ - if (!scl_in(dd, target, 0)) - return -EIO; - - /* Check if SDA is low, if it is low then we have to clock SDA - * up to 9 times for the device to release the bus - */ - while (clock_cycles_left--) { - if (sda_in(dd, target, 0)) - return 0; - scl_out(dd, target, 0); - scl_out(dd, target, 1); - } - - return -EIO; -} - -#define HFI1_TWSI_START 0x100 -#define HFI1_TWSI_STOP 0x200 - -/* Write byte to TWSI, optionally prefixed with START or suffixed with - * STOP. - * returns 0 if OK (ACK received), else != 0 - */ -static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags) -{ - int ret = 1; - - if (flags & HFI1_TWSI_START) - start_seq(dd, target); - - /* Leaves SCL low (from i2c_ackrcv()) */ - ret = wr_byte(dd, target, data); - - if (flags & HFI1_TWSI_STOP) - stop_cmd(dd, target); - return ret; -} - -/* Added functionality for IBA7220-based cards */ -#define HFI1_TEMP_DEV 0x98 - -/* - * hfi1_twsi_blk_rd - * General interface for data transfer from twsi devices. - * One vestige of its former role is that it recognizes a device - * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part, - * which responded to all TWSI device codes, interpreting them as - * address within device. On all other devices found on board handled by - * this driver, the device is followed by a N-byte "address" which selects - * the "register" or "offset" within the device from which data should - * be read. 
- */ -int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr, - void *buffer, int len) -{ - u8 *bp = buffer; - int ret = 1; - int i; - int offset_size; - - /* obtain the offset size, strip it from the device address */ - offset_size = (dev >> 8) & 0xff; - dev &= 0xff; - - /* allow at most a 2 byte offset */ - if (offset_size > 2) - goto bail; - - if (dev == HFI1_TWSI_NO_DEV) { - /* legacy not-really-I2C */ - addr = (addr << 1) | READ_CMD; - ret = twsi_wr(dd, target, addr, HFI1_TWSI_START); - } else { - /* Actual I2C */ - if (offset_size) { - ret = twsi_wr(dd, target, - dev | WRITE_CMD, HFI1_TWSI_START); - if (ret) { - stop_cmd(dd, target); - goto bail; - } - - for (i = 0; i < offset_size; i++) { - ret = twsi_wr(dd, target, - (addr >> (i * 8)) & 0xff, 0); - udelay(TWSI_BUF_WAIT_USEC); - if (ret) { - dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n", - i, addr); - goto bail; - } - } - } - ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START); - } - if (ret) { - stop_cmd(dd, target); - goto bail; - } - - /* - * block devices keeps clocking data out as long as we ack, - * automatically incrementing the address. Some have "pages" - * whose boundaries will not be crossed, but the handling - * of these is left to the caller, who is in a better - * position to know. - */ - while (len-- > 0) { - /* - * Get and store data, sending ACK if length remaining, - * else STOP - */ - *bp++ = rd_byte(dd, target, !len); - } - - ret = 0; - -bail: - return ret; -} - -/* - * hfi1_twsi_blk_wr - * General interface for data transfer to twsi devices. - * One vestige of its former role is that it recognizes a device - * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part, - * which responded to all TWSI device codes, interpreting them as - * address within device. 
On all other devices found on board handled by - * this driver, the device is followed by a N-byte "address" which selects - * the "register" or "offset" within the device to which data should - * be written. - */ -int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr, - const void *buffer, int len) -{ - const u8 *bp = buffer; - int ret = 1; - int i; - int offset_size; - - /* obtain the offset size, strip it from the device address */ - offset_size = (dev >> 8) & 0xff; - dev &= 0xff; - - /* allow at most a 2 byte offset */ - if (offset_size > 2) - goto bail; - - if (dev == HFI1_TWSI_NO_DEV) { - if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD, - HFI1_TWSI_START)) { - goto failed_write; - } - } else { - /* Real I2C */ - if (twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START)) - goto failed_write; - } - - for (i = 0; i < offset_size; i++) { - ret = twsi_wr(dd, target, (addr >> (i * 8)) & 0xff, 0); - udelay(TWSI_BUF_WAIT_USEC); - if (ret) { - dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n", - i, addr); - goto bail; - } - } - - for (i = 0; i < len; i++) - if (twsi_wr(dd, target, *bp++, 0)) - goto failed_write; - - ret = 0; - -failed_write: - stop_cmd(dd, target); - -bail: - return ret; -} diff --git a/drivers/infiniband/hw/hfi1/twsi.h b/drivers/infiniband/hw/hfi1/twsi.h deleted file mode 100644 index 5b8a5b5e7eae..000000000000 --- a/drivers/infiniband/hw/hfi1/twsi.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef _TWSI_H -#define _TWSI_H -/* - * Copyright(c) 2015, 2016 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - */ - -#define HFI1_TWSI_NO_DEV 0xFF - -struct hfi1_devdata; - -/* Bit position of SDA/SCL pins in ASIC_QSFP* registers */ -#define GPIO_SDA_NUM 1 -#define GPIO_SCL_NUM 0 - -/* these functions must be called with qsfp_lock held */ -int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target); -int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr, - void *buffer, int len); -int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr, - const void *buffer, int len); - -#endif /* _TWSI_H */ diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index df773d433297..a726d96d185f 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -119,6 +119,31 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; } /* + * Local operations are processed immediately + * after all prior requests have completed. + */ + if (wqe->wr.opcode == IB_WR_REG_MR || + wqe->wr.opcode == IB_WR_LOCAL_INV) { + int local_ops = 0; + int err = 0; + + if (qp->s_last != qp->s_cur) + goto bail; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { + err = rvt_invalidate_rkey( + qp, wqe->wr.ex.invalidate_rkey); + local_ops = 1; + } + hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); + if (local_ops) + atomic_dec(&qp->local_ops_pending); + qp->s_hdrwords = 0; + goto done_free_tx; + } + /* * Start a new request. 
*/ qp->s_psn = wqe->psn; @@ -294,46 +319,12 @@ void hfi1_uc_rcv(struct hfi1_packet *packet) struct ib_reth *reth; int has_grh = rcv_flags & HFI1_HAS_GRH; int ret; - u32 bth1; bth0 = be32_to_cpu(ohdr->bth[0]); if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0)) return; - bth1 = be32_to_cpu(ohdr->bth[1]); - if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) { - if (bth1 & HFI1_BECN_SMASK) { - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u32 rqpn, lqpn; - u16 rlid = be16_to_cpu(hdr->lrh[3]); - u8 sl, sc5; - - lqpn = bth1 & RVT_QPN_MASK; - rqpn = qp->remote_qpn; - - sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; - sl = ibp->sc_to_sl[sc5]; - - process_becn(ppd, sl, rlid, lqpn, rqpn, - IB_CC_SVCTYPE_UC); - } - - if (bth1 & HFI1_FECN_SMASK) { - struct ib_grh *grh = NULL; - u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]); - u16 slid = be16_to_cpu(hdr->lrh[3]); - u16 dlid = be16_to_cpu(hdr->lrh[1]); - u32 src_qp = qp->remote_qpn; - u8 sc5; - - sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; - if (has_grh) - grh = &hdr->u.l.grh; - - return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, - grh); - } - } + process_ecn(qp, packet, true); psn = be32_to_cpu(ohdr->bth[2]); opcode = (bth0 >> 24) & 0xff; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index be91f6fa1c87..f01e8e1d62d3 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -184,8 +184,12 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) } if (ah_attr->ah_flags & IB_AH_GRH) { - hfi1_copy_sge(&qp->r_sge, &ah_attr->grh, - sizeof(struct ib_grh), 1, 0); + struct ib_grh grh; + struct ib_global_route grd = ah_attr->grh; + + hfi1_make_grh(ibp, &grh, &grd, 0, 0); + hfi1_copy_sge(&qp->r_sge, &grh, + sizeof(grh), 1, 0); wc.wc_flags |= IB_WC_GRH; } else { hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); @@ -430,10 +434,9 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->qkey : wqe->ud_wr.remote_qkey); ohdr->u.ud.deth[1] = 
cpu_to_be32(qp->ibqp.qp_num); /* disarm any ahg */ - priv->s_hdr->ahgcount = 0; - priv->s_hdr->ahgidx = 0; - priv->s_hdr->tx_flags = 0; - priv->s_hdr->sde = NULL; + priv->s_ahg->ahgcount = 0; + priv->s_ahg->ahgidx = 0; + priv->s_ahg->tx_flags = 0; /* pbc */ ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2; @@ -665,13 +668,13 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) struct hfi1_other_headers *ohdr = packet->ohdr; int opcode; u32 hdrsize = packet->hlen; - u32 pad; struct ib_wc wc; u32 qkey; u32 src_qp; u16 dlid, pkey; int mgmt_pkey_idx = -1; struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_ib_header *hdr = packet->hdr; u32 rcv_flags = packet->rcv_flags; void *data = packet->ebuf; @@ -680,52 +683,33 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) bool has_grh = rcv_flags & HFI1_HAS_GRH; u8 sc5 = hdr2sc((struct hfi1_message_header *)hdr, packet->rhf); u32 bth1; - int is_mcast; - struct ib_grh *grh = NULL; + u8 sl_from_sc, sl; + u16 slid; + u8 extra_bytes; qkey = be32_to_cpu(ohdr->u.ud.deth[0]); src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; dlid = be16_to_cpu(hdr->lrh[1]); - is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && - (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); bth1 = be32_to_cpu(ohdr->bth[1]); - if (unlikely(bth1 & HFI1_BECN_SMASK)) { - /* - * In pre-B0 h/w the CNP_OPCODE is handled via an - * error path. - */ - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u32 lqpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; - u8 sl; - - sl = ibp->sc_to_sl[sc5]; - - process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD); - } + slid = be16_to_cpu(hdr->lrh[3]); + pkey = (u16)be32_to_cpu(ohdr->bth[0]); + sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; + extra_bytes = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + extra_bytes += (SIZE_OF_CRC << 2); + sl_from_sc = ibp->sc_to_sl[sc5]; - /* - * The opcode is in the low byte when its in network order - * (top byte when in host order). 
- */ opcode = be32_to_cpu(ohdr->bth[0]) >> 24; opcode &= 0xff; - pkey = (u16)be32_to_cpu(ohdr->bth[0]); - - if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) { - u16 slid = be16_to_cpu(hdr->lrh[3]); - - return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh); - } + process_ecn(qp, packet, (opcode != IB_OPCODE_CNP)); /* * Get the number of bytes the message was padded by * and drop incomplete packets. */ - pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; - if (unlikely(tlen < (hdrsize + pad + 4))) + if (unlikely(tlen < (hdrsize + extra_bytes))) goto drop; - tlen -= hdrsize + pad + 4; + tlen -= hdrsize + extra_bytes; /* * Check that the permissive LID is only used on QP0 @@ -736,10 +720,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) hdr->lrh[3] == IB_LID_PERMISSIVE)) goto drop; if (qp->ibqp.qp_num > 1) { - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u16 slid; - - slid = be16_to_cpu(hdr->lrh[3]); if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) { /* * Traps will not be sent for packets dropped @@ -748,12 +728,9 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) * IB spec (release 1.3, section 10.9.4) */ hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, - pkey, - (be16_to_cpu(hdr->lrh[0]) >> 4) & - 0xF, + pkey, sl, src_qp, qp->ibqp.qp_num, - be16_to_cpu(hdr->lrh[3]), - be16_to_cpu(hdr->lrh[1])); + slid, dlid); return; } } else { @@ -763,22 +740,18 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) goto drop; } if (unlikely(qkey != qp->qkey)) { - hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, - (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, sl, src_qp, qp->ibqp.qp_num, - be16_to_cpu(hdr->lrh[3]), - be16_to_cpu(hdr->lrh[1])); + slid, dlid); return; } /* Drop invalid MAD packets (see 13.5.3.1). 
*/ if (unlikely(qp->ibqp.qp_num == 1 && - (tlen > 2048 || - (be16_to_cpu(hdr->lrh[0]) >> 12) == 15))) + (tlen > 2048 || (sc5 == 0xF)))) goto drop; } else { /* Received on QP0, and so by definition, this is an SMP */ struct opa_smp *smp = (struct opa_smp *)data; - u16 slid = be16_to_cpu(hdr->lrh[3]); if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp)) goto drop; @@ -861,7 +834,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) qp->ibqp.qp_type == IB_QPT_SMI) { if (mgmt_pkey_idx < 0) { if (net_ratelimit()) { - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_devdata *dd = ppd->dd; dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n", @@ -874,8 +846,8 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) wc.pkey_index = 0; } - wc.slid = be16_to_cpu(hdr->lrh[3]); - wc.sl = ibp->sc_to_sl[sc5]; + wc.slid = slid; + wc.sl = sl_from_sc; /* * Save the LMC lower bits if the destination LID is a unicast LID. diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 1b640a35b3fe..64d26525435a 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -82,24 +82,25 @@ struct tid_pageset { ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT)) static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *, - struct rb_root *); + struct hfi1_filedata *); static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *); static int set_rcvarray_entry(struct file *, unsigned long, u32, struct tid_group *, struct page **, unsigned); -static int mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); -static void mmu_rb_remove(struct rb_root *, struct mmu_rb_node *, - struct mm_struct *); -static int mmu_rb_invalidate(struct rb_root *, struct mmu_rb_node *); +static int tid_rb_insert(void *, struct mmu_rb_node *); +static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, + struct tid_rb_node *tnode); +static void tid_rb_remove(void *, struct 
mmu_rb_node *); +static int tid_rb_invalidate(void *, struct mmu_rb_node *); static int program_rcvarray(struct file *, unsigned long, struct tid_group *, struct tid_pageset *, unsigned, u16, struct page **, u32 *, unsigned *, unsigned *); static int unprogram_rcvarray(struct file *, u32, struct tid_group **); -static void clear_tid_node(struct hfi1_filedata *, u16, struct tid_rb_node *); +static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node); static struct mmu_rb_ops tid_rb_ops = { - .insert = mmu_rb_insert, - .remove = mmu_rb_remove, - .invalidate = mmu_rb_invalidate + .insert = tid_rb_insert, + .remove = tid_rb_remove, + .invalidate = tid_rb_invalidate }; static inline u32 rcventry2tidinfo(u32 rcventry) @@ -162,7 +163,6 @@ int hfi1_user_exp_rcv_init(struct file *fp) spin_lock_init(&fd->tid_lock); spin_lock_init(&fd->invalid_lock); - fd->tid_rb_root = RB_ROOT; if (!uctxt->subctxt_cnt || !fd->subctxt) { exp_tid_group_init(&uctxt->tid_group_list); @@ -197,7 +197,7 @@ int hfi1_user_exp_rcv_init(struct file *fp) if (!fd->entry_to_rb) return -ENOMEM; - if (!HFI1_CAP_IS_USET(TID_UNMAP)) { + if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) { fd->invalid_tid_idx = 0; fd->invalid_tids = kzalloc(uctxt->expected_count * sizeof(u32), GFP_KERNEL); @@ -208,15 +208,15 @@ int hfi1_user_exp_rcv_init(struct file *fp) /* * Register MMU notifier callbacks. If the registration - * fails, continue but turn off the TID caching for - * all user contexts. + * fails, continue without TID caching for this context. */ - ret = hfi1_mmu_rb_register(&fd->tid_rb_root, &tid_rb_ops); + ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops, + dd->pport->hfi1_wq, + &fd->handler); if (ret) { dd_dev_info(dd, "Failed MMU notifier registration %d\n", ret); - HFI1_CAP_USET(TID_UNMAP); ret = 0; } } @@ -235,7 +235,7 @@ int hfi1_user_exp_rcv_init(struct file *fp) * init. 
*/ spin_lock(&fd->tid_lock); - if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) { + if (uctxt->subctxt_cnt && fd->handler) { u16 remainder; fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt; @@ -261,18 +261,16 @@ int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) * The notifier would have been removed when the process'es mm * was freed. */ - if (!HFI1_CAP_IS_USET(TID_UNMAP)) - hfi1_mmu_rb_unregister(&fd->tid_rb_root); + if (fd->handler) + hfi1_mmu_rb_unregister(fd->handler); kfree(fd->invalid_tids); if (!uctxt->cnt) { if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list)) - unlock_exp_tids(uctxt, &uctxt->tid_full_list, - &fd->tid_rb_root); + unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd); if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list)) - unlock_exp_tids(uctxt, &uctxt->tid_used_list, - &fd->tid_rb_root); + unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd); list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list, list) { list_del_init(&grp->list); @@ -399,12 +397,12 @@ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo) * pages, accept the amount pinned so far and program only that. * User space knows how to deal with partially programmed buffers. */ - if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) { + if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) { ret = -ENOMEM; goto bail; } - pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages); + pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages); if (pinned <= 0) { ret = pinned; goto bail; @@ -559,7 +557,7 @@ nomem: * for example), unpin all unmapped pages so we can pin them nex time. 
*/ if (mapped_pages != pinned) { - hfi1_release_user_pages(current->mm, &pages[mapped_pages], + hfi1_release_user_pages(fd->mm, &pages[mapped_pages], pinned - mapped_pages, false); fd->tid_n_pinned -= pinned - mapped_pages; @@ -829,7 +827,6 @@ static int set_rcvarray_entry(struct file *fp, unsigned long vaddr, struct hfi1_ctxtdata *uctxt = fd->uctxt; struct tid_rb_node *node; struct hfi1_devdata *dd = uctxt->dd; - struct rb_root *root = &fd->tid_rb_root; dma_addr_t phys; /* @@ -861,10 +858,10 @@ static int set_rcvarray_entry(struct file *fp, unsigned long vaddr, node->freed = false; memcpy(node->pages, pages, sizeof(struct page *) * npages); - if (HFI1_CAP_IS_USET(TID_UNMAP)) - ret = mmu_rb_insert(root, &node->mmu); + if (!fd->handler) + ret = tid_rb_insert(fd, &node->mmu); else - ret = hfi1_mmu_rb_insert(root, &node->mmu); + ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu); if (ret) { hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d", @@ -904,19 +901,19 @@ static int unprogram_rcvarray(struct file *fp, u32 tidinfo, node = fd->entry_to_rb[rcventry]; if (!node || node->rcventry != (uctxt->expected_base + rcventry)) return -EBADF; - if (HFI1_CAP_IS_USET(TID_UNMAP)) - mmu_rb_remove(&fd->tid_rb_root, &node->mmu, NULL); - else - hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu); if (grp) *grp = node->grp; - clear_tid_node(fd, fd->subctxt, node); + + if (!fd->handler) + cacheless_tid_rb_remove(fd, node); + else + hfi1_mmu_rb_remove(fd->handler, &node->mmu); + return 0; } -static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt, - struct tid_rb_node *node) +static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) { struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; @@ -934,7 +931,7 @@ static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt, pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len, PCI_DMA_FROMDEVICE); - hfi1_release_user_pages(current->mm, node->pages, node->npages, true); + 
hfi1_release_user_pages(fd->mm, node->pages, node->npages, true); fd->tid_n_pinned -= node->npages; node->grp->used--; @@ -949,12 +946,15 @@ static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt, kfree(node); } +/* + * As a simple helper for hfi1_user_exp_rcv_free, this function deals with + * clearing nodes in the non-cached case. + */ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, - struct exp_tid_set *set, struct rb_root *root) + struct exp_tid_set *set, + struct hfi1_filedata *fd) { struct tid_group *grp, *ptr; - struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata, - tid_rb_root); int i; list_for_each_entry_safe(grp, ptr, &set->list, list) { @@ -969,22 +969,23 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, uctxt->expected_base]; if (!node || node->rcventry != rcventry) continue; - if (HFI1_CAP_IS_USET(TID_UNMAP)) - mmu_rb_remove(&fd->tid_rb_root, - &node->mmu, NULL); - else - hfi1_mmu_rb_remove(&fd->tid_rb_root, - &node->mmu); - clear_tid_node(fd, -1, node); + + cacheless_tid_rb_remove(fd, node); } } } } -static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) +/* + * Always return 0 from this function. A non-zero return indicates that the + * remove operation will be called and that memory should be unpinned. + * However, the driver cannot unpin out from under PSM. Instead, retain the + * memory (by returning 0) and inform PSM that the memory is going away. PSM + * will call back later when it has removed the memory from its list. 
+ */ +static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode) { - struct hfi1_filedata *fdata = - container_of(root, struct hfi1_filedata, tid_rb_root); + struct hfi1_filedata *fdata = arg; struct hfi1_ctxtdata *uctxt = fdata->uctxt; struct tid_rb_node *node = container_of(mnode, struct tid_rb_node, mmu); @@ -1025,10 +1026,9 @@ static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) return 0; } -static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node) +static int tid_rb_insert(void *arg, struct mmu_rb_node *node) { - struct hfi1_filedata *fdata = - container_of(root, struct hfi1_filedata, tid_rb_root); + struct hfi1_filedata *fdata = arg; struct tid_rb_node *tnode = container_of(node, struct tid_rb_node, mmu); u32 base = fdata->uctxt->expected_base; @@ -1037,14 +1037,20 @@ static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node) return 0; } -static void mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node, - struct mm_struct *mm) +static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, + struct tid_rb_node *tnode) { - struct hfi1_filedata *fdata = - container_of(root, struct hfi1_filedata, tid_rb_root); - struct tid_rb_node *tnode = - container_of(node, struct tid_rb_node, mmu); u32 base = fdata->uctxt->expected_base; fdata->entry_to_rb[tnode->rcventry - base] = NULL; + clear_tid_node(fdata, tnode); +} + +static void tid_rb_remove(void *arg, struct mmu_rb_node *node) +{ + struct hfi1_filedata *fdata = arg; + struct tid_rb_node *tnode = + container_of(node, struct tid_rb_node, mmu); + + cacheless_tid_rb_remove(fdata, tnode); } diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 88e10b5f55f1..20f4ddcac3b0 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -68,7 +68,8 @@ MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)"); * could keeping caching buffers. 
* */ -bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages) +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, + u32 nlocked, u32 npages) { unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit, size = (cache_size * (1UL << 20)); /* convert to bytes */ @@ -89,9 +90,9 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages) /* Convert to number of pages */ size = DIV_ROUND_UP(size, PAGE_SIZE); - down_read(&current->mm->mmap_sem); - pinned = current->mm->pinned_vm; - up_read(&current->mm->mmap_sem); + down_read(&mm->mmap_sem); + pinned = mm->pinned_vm; + up_read(&mm->mmap_sem); /* First, check the absolute limit against all pinned pages. */ if (pinned + npages >= ulimit && !can_lock) @@ -100,8 +101,8 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages) return ((nlocked + npages) <= size) || can_lock; } -int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable, - struct page **pages) +int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages, + bool writable, struct page **pages) { int ret; @@ -109,9 +110,9 @@ int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable, if (ret < 0) return ret; - down_write(&current->mm->mmap_sem); - current->mm->pinned_vm += ret; - up_write(&current->mm->mmap_sem); + down_write(&mm->mmap_sem); + mm->pinned_vm += ret; + up_write(&mm->mmap_sem); return ret; } diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 47ffd273ecbd..0ecf27903dc2 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -145,7 +145,7 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring.
Default: 12 /* Last packet in the request */ #define TXREQ_FLAGS_REQ_LAST_PKT BIT(0) -#define SDMA_REQ_IN_USE 0 +/* SDMA request flag bits */ #define SDMA_REQ_FOR_THREAD 1 #define SDMA_REQ_SEND_DONE 2 #define SDMA_REQ_HAVE_AHG 3 @@ -183,16 +183,18 @@ struct user_sdma_iovec { struct sdma_mmu_node *node; }; -#define SDMA_CACHE_NODE_EVICT 0 - struct sdma_mmu_node { struct mmu_rb_node rb; - struct list_head list; struct hfi1_user_sdma_pkt_q *pq; atomic_t refcount; struct page **pages; unsigned npages; - unsigned long flags; +}; + +/* evict operation argument */ +struct evict_data { + u32 cleared; /* count evicted so far */ + u32 target; /* target count to evict */ }; struct user_sdma_request { @@ -305,14 +307,16 @@ static int defer_packet_queue( unsigned seq); static void activate_packet_queue(struct iowait *, int); static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long); -static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *); -static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *, - struct mm_struct *); -static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *); +static int sdma_rb_insert(void *, struct mmu_rb_node *); +static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, + void *arg2, bool *stop); +static void sdma_rb_remove(void *, struct mmu_rb_node *); +static int sdma_rb_invalidate(void *, struct mmu_rb_node *); static struct mmu_rb_ops sdma_rb_ops = { .filter = sdma_rb_filter, .insert = sdma_rb_insert, + .evict = sdma_rb_evict, .remove = sdma_rb_remove, .invalidate = sdma_rb_invalidate }; @@ -397,6 +401,11 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) if (!pq->reqs) goto pq_reqs_nomem; + memsize = BITS_TO_LONGS(hfi1_sdma_comp_ring_size) * sizeof(long); + pq->req_in_use = kzalloc(memsize, GFP_KERNEL); + if (!pq->req_in_use) + goto pq_reqs_no_in_use; + INIT_LIST_HEAD(&pq->list); pq->dd = dd; pq->ctxt = uctxt->ctxt; @@ -405,9 +414,8 @@ int 
hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) pq->state = SDMA_PKT_Q_INACTIVE; atomic_set(&pq->n_reqs, 0); init_waitqueue_head(&pq->wait); - pq->sdma_rb_root = RB_ROOT; - INIT_LIST_HEAD(&pq->evict); - spin_lock_init(&pq->evict_lock); + atomic_set(&pq->n_locked, 0); + pq->mm = fd->mm; iowait_init(&pq->busy, 0, NULL, defer_packet_queue, activate_packet_queue, NULL); @@ -437,7 +445,8 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) cq->nentries = hfi1_sdma_comp_ring_size; fd->cq = cq; - ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops); + ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq, + &pq->handler); if (ret) { dd_dev_err(dd, "Failed to register with MMU %d", ret); goto done; @@ -453,6 +462,8 @@ cq_comps_nomem: cq_nomem: kmem_cache_destroy(pq->txreq_cache); pq_txreq_nomem: + kfree(pq->req_in_use); +pq_reqs_no_in_use: kfree(pq->reqs); pq_reqs_nomem: kfree(pq); @@ -472,8 +483,9 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit, uctxt->ctxt, fd->subctxt); pq = fd->pq; - hfi1_mmu_rb_unregister(&pq->sdma_rb_root); if (pq) { + if (pq->handler) + hfi1_mmu_rb_unregister(pq->handler); spin_lock_irqsave(&uctxt->sdma_qlock, flags); if (!list_empty(&pq->list)) list_del_init(&pq->list); @@ -484,6 +496,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) pq->wait, (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); kfree(pq->reqs); + kfree(pq->req_in_use); kmem_cache_destroy(pq->txreq_cache); kfree(pq); fd->pq = NULL; @@ -496,10 +509,31 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) return 0; } +static u8 dlid_to_selector(u16 dlid) +{ + static u8 mapping[256]; + static int initialized; + static u8 next; + int hash; + + if (!initialized) { + memset(mapping, 0xFF, 256); + initialized = 1; + } + + hash = ((dlid >> 8) ^ dlid) & 0xFF; + if (mapping[hash] == 0xFF) { + mapping[hash] = next; + 
next = (next + 1) & 0x7F; + } + + return mapping[hash]; +} + int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, unsigned long dim, unsigned long *count) { - int ret = 0, i = 0; + int ret = 0, i; struct hfi1_filedata *fd = fp->private_data; struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_user_sdma_pkt_q *pq = fd->pq; @@ -511,6 +545,8 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, struct user_sdma_request *req; u8 opcode, sc, vl; int req_queued = 0; + u16 dlid; + u8 selector; if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { hfi1_cdbg( @@ -529,30 +565,48 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, (u16 *)&info); - if (cq->comps[info.comp_idx].status == QUEUED || - test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) { - hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state", - dd->unit, uctxt->ctxt, fd->subctxt, - info.comp_idx); - return -EBADSLT; + + if (info.comp_idx >= hfi1_sdma_comp_ring_size) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Invalid comp index", + dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); + return -EINVAL; } + + /* + * Sanity check the header io vector count. Need at least 1 vector + * (header) and cannot be larger than the actual io vector count. + */ + if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Invalid iov count %d, dim %ld", + dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx, + req_iovcnt(info.ctrl), dim); + return -EINVAL; + } + if (!info.fragsize) { hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Request does not specify fragsize", dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); return -EINVAL; } + + /* Try to claim the request. 
*/ + if (test_and_set_bit(info.comp_idx, pq->req_in_use)) { + hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use", + dd->unit, uctxt->ctxt, fd->subctxt, + info.comp_idx); + return -EBADSLT; + } /* - * We've done all the safety checks that we can up to this point, - * "allocate" the request entry. + * All safety checks have been done and this request has been claimed. */ hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); req = pq->reqs + info.comp_idx; memset(req, 0, sizeof(*req)); - /* Mark the request as IN_USE before we start filling it in. */ - set_bit(SDMA_REQ_IN_USE, &req->flags); - req->data_iovs = req_iovcnt(info.ctrl) - 1; + req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ req->pq = pq; req->cq = cq; req->status = -1; @@ -560,13 +614,22 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, memcpy(&req->info, &info, sizeof(info)); - if (req_opcode(info.ctrl) == EXPECTED) + if (req_opcode(info.ctrl) == EXPECTED) { + /* expected must have a TID info and at least one data vector */ + if (req->data_iovs < 2) { + SDMA_DBG(req, + "Not enough vectors for expected request"); + ret = -EINVAL; + goto free_req; + } req->data_iovs--; + } if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, MAX_VECTORS_PER_REQ); - return -EINVAL; + ret = -EINVAL; + goto free_req; } /* Copy the header from the user buffer */ ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), @@ -634,7 +697,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, idx++; /* Save all the IO vector structures */ - while (i < req->data_iovs) { + for (i = 0; i < req->data_iovs; i++) { INIT_LIST_HEAD(&req->iovs[i].list); memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec)); ret = pin_vector_pages(req, &req->iovs[i]); @@ -642,7 +705,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec 
*iovec, req->status = ret; goto free_req; } - req->data_len += req->iovs[i++].iov.iov_len; + req->data_len += req->iovs[i].iov.iov_len; } SDMA_DBG(req, "total data length %u", req->data_len); @@ -686,9 +749,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, idx++; } + dlid = be16_to_cpu(req->hdr.lrh[1]); + selector = dlid_to_selector(dlid); + /* Have to select the engine */ req->sde = sdma_select_engine_vl(dd, - (u32)(uctxt->ctxt + fd->subctxt), + (u32)(uctxt->ctxt + fd->subctxt + + selector), vl); if (!req->sde || !sdma_running(req->sde)) { ret = -ECOMM; @@ -766,14 +833,21 @@ static inline u32 compute_data_length(struct user_sdma_request *req, * The size of the data of the first packet is in the header * template. However, it includes the header and ICRC, which need * to be subtracted. + * The minimum representable packet data length in a header is 4 bytes, + * therefore, when the data length request is less than 4 bytes, there's + * only one packet, and the packet data length is equal to that of the + * request data length. * The size of the remaining packets is the minimum of the frag * size (MTU) or remaining data in the request. 
*/ u32 len; if (!req->seqnum) { - len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - - (sizeof(tx->hdr) - 4)); + if (req->data_len < sizeof(u32)) + len = req->data_len; + else + len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - + (sizeof(tx->hdr) - 4)); } else if (req_opcode(req->info.ctrl) == EXPECTED) { u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * PAGE_SIZE; @@ -803,6 +877,13 @@ static inline u32 compute_data_length(struct user_sdma_request *req, return len; } +static inline u32 pad_len(u32 len) +{ + if (len & (sizeof(u32) - 1)) + len += sizeof(u32) - (len & (sizeof(u32) - 1)); + return len; +} + static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) { /* (Size of complete header - size of PBC) + 4B ICRC + data length */ @@ -894,7 +975,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) { if (!req->seqnum) { u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); - u32 lrhlen = get_lrh_len(req->hdr, datalen); + u32 lrhlen = get_lrh_len(req->hdr, + pad_len(datalen)); /* * Copy the request header into the tx header * because the HW needs a cacheline-aligned @@ -1048,39 +1130,24 @@ static inline int num_user_pages(const struct iovec *iov) static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) { - u32 cleared = 0; - struct sdma_mmu_node *node, *ptr; - struct list_head to_evict = LIST_HEAD_INIT(to_evict); - - spin_lock(&pq->evict_lock); - list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) { - /* Make sure that no one is still using the node. 
*/ - if (!atomic_read(&node->refcount)) { - set_bit(SDMA_CACHE_NODE_EVICT, &node->flags); - list_del_init(&node->list); - list_add(&node->list, &to_evict); - cleared += node->npages; - if (cleared >= npages) - break; - } - } - spin_unlock(&pq->evict_lock); - - list_for_each_entry_safe(node, ptr, &to_evict, list) - hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb); + struct evict_data evict_data; - return cleared; + evict_data.cleared = 0; + evict_data.target = npages; + hfi1_mmu_rb_evict(pq->handler, &evict_data); + return evict_data.cleared; } static int pin_vector_pages(struct user_sdma_request *req, - struct user_sdma_iovec *iovec) { + struct user_sdma_iovec *iovec) +{ int ret = 0, pinned, npages, cleared; struct page **pages; struct hfi1_user_sdma_pkt_q *pq = req->pq; struct sdma_mmu_node *node = NULL; struct mmu_rb_node *rb_node; - rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root, + rb_node = hfi1_mmu_rb_extract(pq->handler, (unsigned long)iovec->iov.iov_base, iovec->iov.iov_len); if (rb_node && !IS_ERR(rb_node)) @@ -1096,7 +1163,6 @@ static int pin_vector_pages(struct user_sdma_request *req, node->rb.addr = (unsigned long)iovec->iov.iov_base; node->pq = pq; atomic_set(&node->refcount, 0); - INIT_LIST_HEAD(&node->list); } npages = num_user_pages(&iovec->iov); @@ -1111,28 +1177,14 @@ static int pin_vector_pages(struct user_sdma_request *req, npages -= node->npages; - /* - * If rb_node is NULL, it means that this is brand new node - * and, therefore not on the eviction list. - * If, however, the rb_node is non-NULL, it means that the - * node is already in RB tree and, therefore on the eviction - * list (nodes are unconditionally inserted in the eviction - * list). In that case, we have to remove the node prior to - * calling the eviction function in order to prevent it from - * freeing this node. 
- */ - if (rb_node) { - spin_lock(&pq->evict_lock); - list_del_init(&node->list); - spin_unlock(&pq->evict_lock); - } retry: - if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) { + if (!hfi1_can_pin_pages(pq->dd, pq->mm, + atomic_read(&pq->n_locked), npages)) { cleared = sdma_cache_evict(pq, npages); if (cleared >= npages) goto retry; } - pinned = hfi1_acquire_user_pages( + pinned = hfi1_acquire_user_pages(pq->mm, ((unsigned long)iovec->iov.iov_base + (node->npages * PAGE_SIZE)), npages, 0, pages + node->npages); @@ -1142,7 +1194,7 @@ retry: goto bail; } if (pinned != npages) { - unpin_vector_pages(current->mm, pages, node->npages, + unpin_vector_pages(pq->mm, pages, node->npages, pinned); ret = -EFAULT; goto bail; @@ -1152,28 +1204,22 @@ retry: node->pages = pages; node->npages += pinned; npages = node->npages; - spin_lock(&pq->evict_lock); - list_add(&node->list, &pq->evict); - pq->n_locked += pinned; - spin_unlock(&pq->evict_lock); + atomic_add(pinned, &pq->n_locked); } iovec->pages = node->pages; iovec->npages = npages; iovec->node = node; - ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb); + ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb); if (ret) { - spin_lock(&pq->evict_lock); - if (!list_empty(&node->list)) - list_del(&node->list); - pq->n_locked -= node->npages; - spin_unlock(&pq->evict_lock); + atomic_sub(node->npages, &pq->n_locked); + iovec->node = NULL; goto bail; } return 0; bail: if (rb_node) - unpin_vector_pages(current->mm, node->pages, 0, node->npages); + unpin_vector_pages(pq->mm, node->pages, 0, node->npages); kfree(node); return ret; } @@ -1181,7 +1227,7 @@ bail: static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, unsigned start, unsigned npages) { - hfi1_release_user_pages(mm, pages + start, npages, 0); + hfi1_release_user_pages(mm, pages + start, npages, false); kfree(pages); } @@ -1192,16 +1238,14 @@ static int check_header_template(struct user_sdma_request *req, /* * Perform safety checks for any 
type of packet: * - transfer size is multiple of 64bytes - * - packet length is multiple of 4bytes - * - entire request length is multiple of 4bytes + * - packet length is multiple of 4 bytes * - packet length is not larger than MTU size * * These checks are only done for the first packet of the * transfer since the header is "given" to us by user space. * For the remainder of the packets we compute the values. */ - if (req->info.fragsize % PIO_BLOCK_SIZE || - lrhlen & 0x3 || req->data_len & 0x3 || + if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || lrhlen > get_lrh_len(*hdr, req->info.fragsize)) return -EINVAL; @@ -1263,7 +1307,7 @@ static int set_txreq_header(struct user_sdma_request *req, struct hfi1_pkt_header *hdr = &tx->hdr; u16 pbclen; int ret; - u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen); + u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); /* Copy the header template to the request before modification */ memcpy(hdr, &req->hdr, sizeof(*hdr)); @@ -1374,7 +1418,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, struct hfi1_user_sdma_pkt_q *pq = req->pq; struct hfi1_pkt_header *hdr = &req->hdr; u16 pbclen = le16_to_cpu(hdr->pbc[0]); - u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len); + u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len)); if (PBC2LRH(pbclen) != lrhlen) { /* PBC.PbcLengthDWs */ @@ -1534,14 +1578,14 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) continue; if (unpin) - hfi1_mmu_rb_remove(&req->pq->sdma_rb_root, + hfi1_mmu_rb_remove(req->pq->handler, &node->rb); else atomic_dec(&node->refcount); } } kfree(req->tids); - clear_bit(SDMA_REQ_IN_USE, &req->flags); + clear_bit(req->info.comp_idx, req->pq->req_in_use); } static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, @@ -1564,7 +1608,7 @@ static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, return (bool)(node->addr == addr); } -static int sdma_rb_insert(struct rb_root 
*root, struct mmu_rb_node *mnode) +static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) { struct sdma_mmu_node *node = container_of(mnode, struct sdma_mmu_node, rb); @@ -1573,48 +1617,45 @@ static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) return 0; } -static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode, - struct mm_struct *mm) +/* + * Return 1 to remove the node from the rb tree and call the remove op. + * + * Called with the rb tree lock held. + */ +static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, + void *evict_arg, bool *stop) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + struct evict_data *evict_data = evict_arg; + + /* is this node still being used? */ + if (atomic_read(&node->refcount)) + return 0; /* keep this node */ + + /* this node will be evicted, add its pages to our count */ + evict_data->cleared += node->npages; + + /* have enough pages been cleared? */ + if (evict_data->cleared >= evict_data->target) + *stop = true; + + return 1; /* remove this node */ +} + +static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) { struct sdma_mmu_node *node = container_of(mnode, struct sdma_mmu_node, rb); - spin_lock(&node->pq->evict_lock); - /* - * We've been called by the MMU notifier but this node has been - * scheduled for eviction. The eviction function will take care - * of freeing this node. - * We have to take the above lock first because we are racing - * against the setting of the bit in the eviction function. 
- */ - if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) { - spin_unlock(&node->pq->evict_lock); - return; - } + atomic_sub(node->npages, &node->pq->n_locked); - if (!list_empty(&node->list)) - list_del(&node->list); - node->pq->n_locked -= node->npages; - spin_unlock(&node->pq->evict_lock); + unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); - /* - * If mm is set, we are being called by the MMU notifier and we - * should not pass a mm_struct to unpin_vector_page(). This is to - * prevent a deadlock when hfi1_release_user_pages() attempts to - * take the mmap_sem, which the MMU notifier has already taken. - */ - unpin_vector_pages(mm ? NULL : current->mm, node->pages, 0, - node->npages); - /* - * If called by the MMU notifier, we have to adjust the pinned - * page count ourselves. - */ - if (mm) - mm->pinned_vm -= node->npages; kfree(node); } -static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) +static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) { struct sdma_mmu_node *node = container_of(mnode, struct sdma_mmu_node, rb); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index b9240e351161..39001714f551 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -63,14 +63,14 @@ struct hfi1_user_sdma_pkt_q { struct hfi1_devdata *dd; struct kmem_cache *txreq_cache; struct user_sdma_request *reqs; + unsigned long *req_in_use; struct iowait busy; unsigned state; wait_queue_head_t wait; unsigned long unpinned; - struct rb_root sdma_rb_root; - u32 n_locked; - struct list_head evict; - spinlock_t evict_lock; /* protect evict and n_locked */ + struct mmu_rb_handler *handler; + atomic_t n_locked; + struct mm_struct *mm; }; struct hfi1_user_sdma_comp_q { diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 849c4b9399d4..2b359540901d 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ 
b/drivers/infiniband/hw/hfi1/verbs.c @@ -306,7 +306,10 @@ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, - [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD, + [IB_WR_SEND_WITH_INV] = IB_WC_SEND, + [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV, + [IB_WR_REG_MR] = IB_WC_REG_MR }; /* @@ -378,6 +381,8 @@ static const opcode_handler opcode_handler_tbl[256] = { [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = &hfi1_rc_rcv, [IB_OPCODE_RC_COMPARE_SWAP] = &hfi1_rc_rcv, [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv, @@ -540,19 +545,15 @@ void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release) /* * Make sure the QP is ready and able to accept the given opcode. 
*/ -static inline int qp_ok(int opcode, struct hfi1_packet *packet) +static inline opcode_handler qp_ok(int opcode, struct hfi1_packet *packet) { - struct hfi1_ibport *ibp; - if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) - goto dropit; + return NULL; if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) || (opcode == IB_OPCODE_CNP)) - return 1; -dropit: - ibp = &packet->rcd->ppd->ibport_data; - ibp->rvp.n_pkt_drops++; - return 0; + return opcode_handler_tbl[opcode]; + + return NULL; } /** @@ -571,6 +572,7 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) struct hfi1_pportdata *ppd = rcd->ppd; struct hfi1_ibport *ibp = &ppd->ibport_data; struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; + opcode_handler packet_handler; unsigned long flags; u32 qp_num; int lnh; @@ -616,8 +618,11 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) list_for_each_entry_rcu(p, &mcast->qp_list, list) { packet->qp = p->qp; spin_lock_irqsave(&packet->qp->r_lock, flags); - if (likely((qp_ok(opcode, packet)))) - opcode_handler_tbl[opcode](packet); + packet_handler = qp_ok(opcode, packet); + if (likely(packet_handler)) + packet_handler(packet); + else + ibp->rvp.n_pkt_drops++; spin_unlock_irqrestore(&packet->qp->r_lock, flags); } /* @@ -634,8 +639,11 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) goto drop; } spin_lock_irqsave(&packet->qp->r_lock, flags); - if (likely((qp_ok(opcode, packet)))) - opcode_handler_tbl[opcode](packet); + packet_handler = qp_ok(opcode, packet); + if (likely(packet_handler)) + packet_handler(packet); + else + ibp->rvp.n_pkt_drops++; spin_unlock_irqrestore(&packet->qp->r_lock, flags); rcu_read_unlock(); } @@ -808,19 +816,19 @@ static int build_verbs_tx_desc( struct rvt_sge_state *ss, u32 length, struct verbs_txreq *tx, - struct ahg_ib_header *ahdr, + struct hfi1_ahg_info *ahg_info, u64 pbc) { int ret = 0; - struct hfi1_pio_header *phdr = &tx->phdr; + struct hfi1_sdma_header *phdr = &tx->phdr; u16 hdrbytes = tx->hdr_dwords << 2; - if 
(!ahdr->ahgcount) { + if (!ahg_info->ahgcount) { ret = sdma_txinit_ahg( &tx->txreq, - ahdr->tx_flags, + ahg_info->tx_flags, hdrbytes + length, - ahdr->ahgidx, + ahg_info->ahgidx, 0, NULL, 0, @@ -838,11 +846,11 @@ static int build_verbs_tx_desc( } else { ret = sdma_txinit_ahg( &tx->txreq, - ahdr->tx_flags, + ahg_info->tx_flags, length, - ahdr->ahgidx, - ahdr->ahgcount, - ahdr->ahgdesc, + ahg_info->ahgidx, + ahg_info->ahgcount, + ahg_info->ahgdesc, hdrbytes, verbs_sdma_complete); if (ret) @@ -860,7 +868,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc) { struct hfi1_qp_priv *priv = qp->priv; - struct ahg_ib_header *ahdr = priv->s_hdr; + struct hfi1_ahg_info *ahg_info = priv->s_ahg; u32 hdrwords = qp->s_hdrwords; struct rvt_sge_state *ss = qp->s_cur_sge; u32 len = qp->s_cur_size; @@ -888,7 +896,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, plen); } tx->wqe = qp->s_wqe; - ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc); + ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahg_info, pbc); if (unlikely(ret)) goto bail_build; } @@ -1291,19 +1299,24 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) static void hfi1_fill_device_attr(struct hfi1_devdata *dd) { struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + u16 ver = dd->dc8051_ver; memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props)); + rdi->dparms.props.fw_ver = ((u64)(dc8051_ver_maj(ver)) << 16) | + (u64)dc8051_ver_min(ver); rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | - IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE | + IB_DEVICE_MEM_MGT_EXTENSIONS; rdi->dparms.props.page_size_cap = PAGE_SIZE; rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; rdi->dparms.props.vendor_part_id = dd->pcidev->device; rdi->dparms.props.hw_ver = 
dd->minrev; rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid; - rdi->dparms.props.max_mr_size = ~0ULL; + rdi->dparms.props.max_mr_size = U64_MAX; + rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; rdi->dparms.props.max_qp = hfi1_max_qps; rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; rdi->dparms.props.max_sge = hfi1_max_sges; @@ -1567,6 +1580,17 @@ static void init_ibport(struct hfi1_pportdata *ppd) RCU_INIT_POINTER(ibp->rvp.qp[1], NULL); } +static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str, + size_t str_len) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + struct hfi1_ibdev *dev = dev_from_rdi(rdi); + u16 ver = dd_from_dev(dev)->dc8051_ver; + + snprintf(str, str_len, "%u.%u", dc8051_ver_maj(ver), + dc8051_ver_min(ver)); +} + /** * hfi1_register_ib_device - register our device with the infiniband core * @dd: the device data structure @@ -1613,6 +1637,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) /* keep process mad in the driver */ ibdev->process_mad = hfi1_process_mad; + ibdev->get_dev_fw_str = hfi1_get_dev_fw_str; strncpy(ibdev->node_desc, init_utsname()->nodename, sizeof(ibdev->node_desc)); @@ -1680,6 +1705,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.nports = dd->num_pports; dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + /* post send table */ + dd->verbs_dev.rdi.post_parms = hfi1_post_parms; + ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) rvt_init_port(&dd->verbs_dev.rdi, @@ -1730,8 +1758,7 @@ void hfi1_cnp_rcv(struct hfi1_packet *packet) struct rvt_qp *qp = packet->qp; u32 lqpn, rqpn = 0; u16 rlid = 0; - u8 sl, sc5, sc4_bit, svc_type; - bool sc4_set = has_sc4_bit(packet); + u8 sl, sc5, svc_type; switch (packet->qp->ibqp.qp_type) { case IB_QPT_UC: @@ -1754,9 +1781,7 @@ void hfi1_cnp_rcv(struct hfi1_packet *packet) return; } - sc4_bit = sc4_set << 4; - sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; - sc5 |= sc4_bit; + sc5 = hdr2sc((struct 
hfi1_message_header *)hdr, packet->rhf); sl = ibp->sc_to_sl[sc5]; lqpn = qp->ibqp.qp_num; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 488356775627..d1b101c54828 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -178,16 +178,14 @@ struct hfi1_ib_header { } u; } __packed; -struct ahg_ib_header { - struct sdma_engine *sde; +struct hfi1_ahg_info { u32 ahgdesc[2]; u16 tx_flags; u8 ahgcount; u8 ahgidx; - struct hfi1_ib_header ibh; }; -struct hfi1_pio_header { +struct hfi1_sdma_header { __le64 pbc; struct hfi1_ib_header hdr; } __packed; @@ -197,7 +195,7 @@ struct hfi1_pio_header { * pair is made common */ struct hfi1_qp_priv { - struct ahg_ib_header *s_hdr; /* next header to send */ + struct hfi1_ahg_info *s_ahg; /* ahg info for next header */ struct sdma_engine *s_sde; /* current sde */ struct send_context *s_sendcontext; /* current sendcontext */ u8 s_sc; /* SC[0..4] for next packet */ diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h index a1d6e0807f97..5660897593ba 100644 --- a/drivers/infiniband/hw/hfi1/verbs_txreq.h +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -56,7 +56,7 @@ #include "iowait.h" struct verbs_txreq { - struct hfi1_pio_header phdr; + struct hfi1_sdma_header phdr; struct sdma_txreq txreq; struct rvt_qp *qp; struct rvt_swqe *wqe; diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index d2fa72516960..5026dc79978a 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -1567,12 +1567,12 @@ static enum i40iw_status_code i40iw_del_multiple_qhash( ret = i40iw_manage_qhash(iwdev, cm_info, I40IW_QHASH_TYPE_TCP_SYN, I40IW_QHASH_MANAGE_TYPE_DELETE, NULL, false); - kfree(child_listen_node); - cm_parent_listen_node->cm_core->stats_listen_nodes_destroyed++; i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, "freed pointer = %p\n", child_listen_node); 
+ kfree(child_listen_node); + cm_parent_listen_node->cm_core->stats_listen_nodes_destroyed++; } spin_unlock_irqrestore(&iwdev->cm_core.listen_list_lock, flags); diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h index bd942da91a27..2fac1db0e0a0 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_d.h +++ b/drivers/infiniband/hw/i40iw/i40iw_d.h @@ -1557,6 +1557,9 @@ enum i40iw_alignment { #define I40IW_RING_MOVE_TAIL(_ring) \ (_ring).tail = ((_ring).tail + 1) % (_ring).size +#define I40IW_RING_MOVE_HEAD_NOCHECK(_ring) \ + (_ring).head = ((_ring).head + 1) % (_ring).size + #define I40IW_RING_MOVE_TAIL_BY_COUNT(_ring, _count) \ (_ring).tail = ((_ring).tail + (_count)) % (_ring).size diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c index e9c6e82af9c7..c62d354f7810 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_puda.c +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c @@ -1025,6 +1025,8 @@ static void i40iw_ieq_compl_pfpdu(struct i40iw_puda_rsrc *ieq, u16 txoffset, bufoffset; buf = i40iw_puda_get_listbuf(pbufl); + if (!buf) + return; nextseqnum = buf->seqnum + fpdu_len; txbuf->totallen = buf->hdrlen + fpdu_len; txbuf->data = (u8 *)txbuf->mem.va + buf->hdrlen; @@ -1048,6 +1050,8 @@ static void i40iw_ieq_compl_pfpdu(struct i40iw_puda_rsrc *ieq, fpdu_len -= buf->datalen; i40iw_puda_ret_bufpool(ieq, buf); buf = i40iw_puda_get_listbuf(pbufl); + if (!buf) + return; bufoffset = (u16)(buf->data - (u8 *)buf->mem.va); } while (1); diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h index 16cc61720b53..2b1a04e9ca3c 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_type.h +++ b/drivers/infiniband/hw/i40iw/i40iw_type.h @@ -667,7 +667,7 @@ struct i40iw_tcp_offload_info { bool time_stamp; u8 cwnd_inc_limit; bool drop_ooo_seg; - bool dup_ack_thresh; + u8 dup_ack_thresh; u8 ttl; u8 src_mac_addr_idx; bool avoid_stretch_ack; diff --git 
a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c index e35faea88c13..4d28c3cb03cc 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_uk.c +++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c @@ -291,9 +291,9 @@ static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp, i40iw_set_fragment(wqe, 0, op_info->lo_sg_list); - for (i = 1; i < op_info->num_lo_sges; i++) { - byte_off = 32 + (i - 1) * 16; + for (i = 1, byte_off = 32; i < op_info->num_lo_sges; i++) { i40iw_set_fragment(wqe, byte_off, &op_info->lo_sg_list[i]); + byte_off += 16; } wmb(); /* make sure WQE is populated before valid bit is set */ @@ -401,9 +401,9 @@ static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp, i40iw_set_fragment(wqe, 0, op_info->sg_list); - for (i = 1; i < op_info->num_sges; i++) { - byte_off = 32 + (i - 1) * 16; + for (i = 1, byte_off = 32; i < op_info->num_sges; i++) { i40iw_set_fragment(wqe, byte_off, &op_info->sg_list[i]); + byte_off += 16; } wmb(); /* make sure WQE is populated before valid bit is set */ @@ -685,9 +685,9 @@ static enum i40iw_status_code i40iw_post_receive(struct i40iw_qp_uk *qp, i40iw_set_fragment(wqe, 0, info->sg_list); - for (i = 1; i < info->num_sges; i++) { - byte_off = 32 + (i - 1) * 16; + for (i = 1, byte_off = 32; i < info->num_sges; i++) { i40iw_set_fragment(wqe, byte_off, &info->sg_list[i]); + byte_off += 16; } wmb(); /* make sure WQE is populated before valid bit is set */ @@ -753,8 +753,7 @@ static enum i40iw_status_code i40iw_cq_post_entries(struct i40iw_cq_uk *cq, * @post_cq: update cq tail */ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, - struct i40iw_cq_poll_info *info, - bool post_cq) + struct i40iw_cq_poll_info *info) { u64 comp_ctx, qword0, qword2, qword3, wqe_qword; u64 *cqe, *sw_wqe; @@ -762,7 +761,6 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, struct i40iw_ring *pring = NULL; u32 wqe_idx, q_type, array_idx = 0; enum i40iw_status_code 
ret_code = 0; - enum i40iw_status_code ret_code2 = 0; bool move_cq_head = true; u8 polarity; u8 addl_wqes = 0; @@ -870,19 +868,14 @@ exit: move_cq_head = false; if (move_cq_head) { - I40IW_RING_MOVE_HEAD(cq->cq_ring, ret_code2); - - if (ret_code2 && !ret_code) - ret_code = ret_code2; + I40IW_RING_MOVE_HEAD_NOCHECK(cq->cq_ring); if (I40IW_RING_GETCURRENT_HEAD(cq->cq_ring) == 0) cq->polarity ^= 1; - if (post_cq) { - I40IW_RING_MOVE_TAIL(cq->cq_ring); - set_64bit_val(cq->shadow_area, 0, - I40IW_RING_GETCURRENT_HEAD(cq->cq_ring)); - } + I40IW_RING_MOVE_TAIL(cq->cq_ring); + set_64bit_val(cq->shadow_area, 0, + I40IW_RING_GETCURRENT_HEAD(cq->cq_ring)); } else { if (info->is_srq) return ret_code; diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h index 4627646fe8cd..276bcefffd7e 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_user.h +++ b/drivers/infiniband/hw/i40iw/i40iw_user.h @@ -327,7 +327,7 @@ struct i40iw_cq_ops { void (*iw_cq_request_notification)(struct i40iw_cq_uk *, enum i40iw_completion_notify); enum i40iw_status_code (*iw_cq_poll_completion)(struct i40iw_cq_uk *, - struct i40iw_cq_poll_info *, bool); + struct i40iw_cq_poll_info *); enum i40iw_status_code (*iw_cq_post_entries)(struct i40iw_cq_uk *, u8 count); void (*iw_cq_clean)(void *, struct i40iw_cq_uk *); }; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 283b64c942ee..2360338877bf 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -529,7 +529,7 @@ static int i40iw_setup_kmode_qp(struct i40iw_device *iwdev, status = i40iw_get_wqe_shift(rq_size, ukinfo->max_rq_frag_cnt, 0, &rqshift); if (status) - return -ENOSYS; + return -ENOMEM; sqdepth = sq_size << sqshift; rqdepth = rq_size << rqshift; @@ -671,7 +671,7 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, iwqp->ctx_info.qp_compl_ctx = (uintptr_t)qp; if (init_attr->qp_type != IB_QPT_RC) { - 
err_code = -ENOSYS; + err_code = -EINVAL; goto error; } if (iwdev->push_mode) @@ -1840,6 +1840,7 @@ struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *pd, iwmr->ibmr.lkey = stag; iwmr->page_cnt = 1; iwmr->pgaddrmem[0] = addr; + iwmr->length = size; status = i40iw_hwreg_mr(iwdev, iwmr, access); if (status) { i40iw_free_stag(iwdev, stag); @@ -1863,7 +1864,7 @@ static struct ib_mr *i40iw_get_dma_mr(struct ib_pd *pd, int acc) { u64 kva = 0; - return i40iw_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva); + return i40iw_reg_phys_mr(pd, 0, 0, acc, &kva); } /** @@ -1975,18 +1976,6 @@ static ssize_t i40iw_show_rev(struct device *dev, } /** - * i40iw_show_fw_ver - */ -static ssize_t i40iw_show_fw_ver(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u32 firmware_version = I40IW_FW_VERSION; - - return sprintf(buf, "%u.%u\n", firmware_version, - (firmware_version & 0x000000ff)); -} - -/** * i40iw_show_hca */ static ssize_t i40iw_show_hca(struct device *dev, @@ -2006,13 +1995,11 @@ static ssize_t i40iw_show_board(struct device *dev, } static DEVICE_ATTR(hw_rev, S_IRUGO, i40iw_show_rev, NULL); -static DEVICE_ATTR(fw_ver, S_IRUGO, i40iw_show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, i40iw_show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, i40iw_show_board, NULL); static struct device_attribute *i40iw_dev_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id }; @@ -2091,8 +2078,12 @@ static int i40iw_post_send(struct ib_qp *ibqp, ret = ukqp->ops.iw_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false); } - if (ret) - err = -EIO; + if (ret) { + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = -ENOMEM; + else + err = -EINVAL; + } break; case IB_WR_RDMA_WRITE: info.op_type = I40IW_OP_TYPE_RDMA_WRITE; @@ -2113,8 +2104,12 @@ static int i40iw_post_send(struct ib_qp *ibqp, ret = ukqp->ops.iw_rdma_write(ukqp, &info, false); } - if (ret) - err = -EIO; + if (ret) { + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = 
-ENOMEM; + else + err = -EINVAL; + } break; case IB_WR_RDMA_READ_WITH_INV: inv_stag = true; @@ -2132,15 +2127,19 @@ static int i40iw_post_send(struct ib_qp *ibqp, info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey; info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length; ret = ukqp->ops.iw_rdma_read(ukqp, &info, inv_stag, false); - if (ret) - err = -EIO; + if (ret) { + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = -ENOMEM; + else + err = -EINVAL; + } break; case IB_WR_LOCAL_INV: info.op_type = I40IW_OP_TYPE_INV_STAG; info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey; ret = ukqp->ops.iw_stag_local_invalidate(ukqp, &info, true); if (ret) - err = -EIO; + err = -ENOMEM; break; case IB_WR_REG_MR: { @@ -2174,7 +2173,7 @@ static int i40iw_post_send(struct ib_qp *ibqp, ret = dev->iw_priv_qp_ops->iw_mr_fast_register(&iwqp->sc_qp, &info, true); if (ret) - err = -EIO; + err = -ENOMEM; break; } default: @@ -2214,6 +2213,7 @@ static int i40iw_post_recv(struct ib_qp *ibqp, struct i40iw_sge sg_list[I40IW_MAX_WQ_FRAGMENT_COUNT]; enum i40iw_status_code ret = 0; unsigned long flags; + int err = 0; iwqp = (struct i40iw_qp *)ibqp; ukqp = &iwqp->sc_qp.qp_uk; @@ -2228,6 +2228,10 @@ static int i40iw_post_recv(struct ib_qp *ibqp, ret = ukqp->ops.iw_post_receive(ukqp, &post_recv); if (ret) { i40iw_pr_err(" post_recv err %d\n", ret); + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = -ENOMEM; + else + err = -EINVAL; *bad_wr = ib_wr; goto out; } @@ -2235,9 +2239,7 @@ static int i40iw_post_recv(struct ib_qp *ibqp, } out: spin_unlock_irqrestore(&iwqp->lock, flags); - if (ret) - return -ENOSYS; - return 0; + return err; } /** @@ -2264,7 +2266,7 @@ static int i40iw_poll_cq(struct ib_cq *ibcq, spin_lock_irqsave(&iwcq->lock, flags); while (cqe_count < num_entries) { - ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info, true); + ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info); if (ret == I40IW_ERR_QUEUE_EMPTY) { break; } else if (ret == 
I40IW_ERR_QUEUE_DESTROYED) { @@ -2437,6 +2439,15 @@ static const char * const i40iw_hw_stat_names[] = { "iwRdmaInv" }; +static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str, + size_t str_len) +{ + u32 firmware_version = I40IW_FW_VERSION; + + snprintf(str, str_len, "%u.%u", firmware_version, + (firmware_version & 0x000000ff)); +} + /** * i40iw_alloc_hw_stats - Allocate a hw stats structure * @ibdev: device pointer from stack @@ -2528,7 +2539,7 @@ static int i40iw_modify_port(struct ib_device *ibdev, int port_modify_mask, struct ib_port_modify *props) { - return 0; + return -ENOSYS; } /** @@ -2660,6 +2671,7 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev memcpy(iwibdev->ibdev.iwcm->ifname, netdev->name, sizeof(iwibdev->ibdev.iwcm->ifname)); iwibdev->ibdev.get_port_immutable = i40iw_port_immutable; + iwibdev->ibdev.get_dev_fw_str = i40iw_get_dev_fw_str; iwibdev->ibdev.poll_cq = i40iw_poll_cq; iwibdev->ibdev.req_notify_cq = i40iw_req_notify_cq; iwibdev->ibdev.post_send = i40iw_post_send; @@ -2723,7 +2735,7 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev) iwdev->iwibdev = i40iw_init_rdma_device(iwdev); if (!iwdev->iwibdev) - return -ENOSYS; + return -ENOMEM; iwibdev = iwdev->iwibdev; ret = ib_register_device(&iwibdev->ibdev, NULL); @@ -2748,5 +2760,5 @@ error: kfree(iwdev->iwibdev->ibdev.iwcm); iwdev->iwibdev->ibdev.iwcm = NULL; ib_dealloc_device(&iwdev->iwibdev->ibdev); - return -ENOSYS; + return ret; } diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 9f8b516eb2b0..d6fc8a6e8c33 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -288,7 +288,7 @@ static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, if (cq->resize_buf) return -EBUSY; - cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL); if (!cq->resize_buf) return -ENOMEM; @@ -316,7 +316,7 
@@ static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) return -EFAULT; - cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL); if (!cq->resize_buf) return -ENOMEM; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 42a46078d7d5..2af44c2de262 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2025,16 +2025,6 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); } -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct mlx4_ib_dev *dev = - container_of(device, struct mlx4_ib_dev, ib_dev.dev); - return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32), - (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, - (int) dev->dev->caps.fw_ver & 0xffff); -} - static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { @@ -2053,17 +2043,204 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr, } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id }; +struct diag_counter { + const char *name; + u32 offset; +}; + +#define DIAG_COUNTER(_name, _offset) \ + { .name = #_name, .offset = _offset } + +static const struct diag_counter diag_basic[] = { + DIAG_COUNTER(rq_num_lle, 0x00), + DIAG_COUNTER(sq_num_lle, 0x04), + DIAG_COUNTER(rq_num_lqpoe, 0x08), + DIAG_COUNTER(sq_num_lqpoe, 0x0C), + DIAG_COUNTER(rq_num_lpe, 0x18), + DIAG_COUNTER(sq_num_lpe, 0x1C), 
+ DIAG_COUNTER(rq_num_wrfe, 0x20), + DIAG_COUNTER(sq_num_wrfe, 0x24), + DIAG_COUNTER(sq_num_mwbe, 0x2C), + DIAG_COUNTER(sq_num_bre, 0x34), + DIAG_COUNTER(sq_num_rire, 0x44), + DIAG_COUNTER(rq_num_rire, 0x48), + DIAG_COUNTER(sq_num_rae, 0x4C), + DIAG_COUNTER(rq_num_rae, 0x50), + DIAG_COUNTER(sq_num_roe, 0x54), + DIAG_COUNTER(sq_num_tree, 0x5C), + DIAG_COUNTER(sq_num_rree, 0x64), + DIAG_COUNTER(rq_num_rnr, 0x68), + DIAG_COUNTER(sq_num_rnr, 0x6C), + DIAG_COUNTER(rq_num_oos, 0x100), + DIAG_COUNTER(sq_num_oos, 0x104), +}; + +static const struct diag_counter diag_ext[] = { + DIAG_COUNTER(rq_num_dup, 0x130), + DIAG_COUNTER(sq_num_to, 0x134), +}; + +static const struct diag_counter diag_device_only[] = { + DIAG_COUNTER(num_cqovf, 0x1A0), + DIAG_COUNTER(rq_num_udsdprd, 0x118), +}; + +static struct rdma_hw_stats *mlx4_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_diag_counters *diag = dev->diag_counters; + + if (!diag[!!port_num].name) + return NULL; + + return rdma_alloc_hw_stats_struct(diag[!!port_num].name, + diag[!!port_num].num_counters, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int mlx4_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_diag_counters *diag = dev->diag_counters; + u32 hw_value[ARRAY_SIZE(diag_device_only) + + ARRAY_SIZE(diag_ext) + ARRAY_SIZE(diag_basic)] = {}; + int ret; + int i; + + ret = mlx4_query_diag_counters(dev->dev, + MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS, + diag[!!port].offset, hw_value, + diag[!!port].num_counters, port); + + if (ret) + return ret; + + for (i = 0; i < diag[!!port].num_counters; i++) + stats->value[i] = hw_value[i]; + + return diag[!!port].num_counters; +} + +static int __mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev, + const char ***name, + u32 **offset, + u32 *num, + bool port) +{ + u32 num_counters; + + num_counters = 
ARRAY_SIZE(diag_basic); + + if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) + num_counters += ARRAY_SIZE(diag_ext); + + if (!port) + num_counters += ARRAY_SIZE(diag_device_only); + + *name = kcalloc(num_counters, sizeof(**name), GFP_KERNEL); + if (!*name) + return -ENOMEM; + + *offset = kcalloc(num_counters, sizeof(**offset), GFP_KERNEL); + if (!*offset) + goto err_name; + + *num = num_counters; + + return 0; + +err_name: + kfree(*name); + return -ENOMEM; +} + +static void mlx4_ib_fill_diag_counters(struct mlx4_ib_dev *ibdev, + const char **name, + u32 *offset, + bool port) +{ + int i; + int j; + + for (i = 0, j = 0; i < ARRAY_SIZE(diag_basic); i++, j++) { + name[i] = diag_basic[i].name; + offset[i] = diag_basic[i].offset; + } + + if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) { + for (i = 0; i < ARRAY_SIZE(diag_ext); i++, j++) { + name[j] = diag_ext[i].name; + offset[j] = diag_ext[i].offset; + } + } + + if (!port) { + for (i = 0; i < ARRAY_SIZE(diag_device_only); i++, j++) { + name[j] = diag_device_only[i].name; + offset[j] = diag_device_only[i].offset; + } + } +} + +static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev) +{ + struct mlx4_ib_diag_counters *diag = ibdev->diag_counters; + int i; + int ret; + bool per_port = !!(ibdev->dev->caps.flags2 & + MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT); + + for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) { + /* i == 1 means we are building port counters */ + if (i && !per_port) + continue; + + ret = __mlx4_ib_alloc_diag_counters(ibdev, &diag[i].name, + &diag[i].offset, + &diag[i].num_counters, i); + if (ret) + goto err_alloc; + + mlx4_ib_fill_diag_counters(ibdev, diag[i].name, + diag[i].offset, i); + } + + ibdev->ib_dev.get_hw_stats = mlx4_ib_get_hw_stats; + ibdev->ib_dev.alloc_hw_stats = mlx4_ib_alloc_hw_stats; + + return 0; + +err_alloc: + if (i) { + kfree(diag[i - 1].name); + kfree(diag[i - 1].offset); + } + + return ret; +} + +static void mlx4_ib_diag_cleanup(struct mlx4_ib_dev *ibdev) 
+{ + int i; + + for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) { + kfree(ibdev->diag_counters[i].offset); + kfree(ibdev->diag_counters[i].name); + } +} + #define MLX4_IB_INVALID_MAC ((u64)-1) static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, struct net_device *dev, @@ -2280,6 +2457,17 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +static void get_fw_ver_str(struct ib_device *device, char *str, + size_t str_len) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev); + snprintf(str, str_len, "%d.%d.%d", + (int) (dev->dev->caps.fw_ver >> 32), + (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, + (int) dev->dev->caps.fw_ver & 0xffff); +} + static void *mlx4_ib_add(struct mlx4_dev *dev) { struct mlx4_ib_dev *ibdev; @@ -2413,6 +2601,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; ibdev->ib_dev.get_port_immutable = mlx4_port_immutable; + ibdev->ib_dev.get_dev_fw_str = get_fw_ver_str; ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext; if (!mlx4_is_slave(ibdev->dev)) { @@ -2555,9 +2744,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) for (j = 1; j <= ibdev->dev->caps.num_ports; j++) atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]); - if (ib_register_device(&ibdev->ib_dev, NULL)) + if (mlx4_ib_alloc_diag_counters(ibdev)) goto err_steer_free_bitmap; + if (ib_register_device(&ibdev->ib_dev, NULL)) + goto err_diag_counters; + if (mlx4_ib_mad_init(ibdev)) goto err_reg; @@ -2623,6 +2815,9 @@ err_mad: err_reg: ib_unregister_device(&ibdev->ib_dev); +err_diag_counters: + mlx4_ib_diag_cleanup(ibdev); + err_steer_free_bitmap: kfree(ibdev->ib_uc_qpns_bitmap); @@ -2726,6 +2921,7 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) mlx4_ib_close_sriov(ibdev); mlx4_ib_mad_cleanup(ibdev); ib_unregister_device(&ibdev->ib_dev); + mlx4_ib_diag_cleanup(ibdev); if 
(ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 29acda249612..7c5832ede4bd 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -549,6 +549,14 @@ struct mlx4_ib_counters { u32 default_counter; }; +#define MLX4_DIAG_COUNTERS_TYPES 2 + +struct mlx4_ib_diag_counters { + const char **name; + u32 *offset; + u32 num_counters; +}; + struct mlx4_ib_dev { struct ib_device ib_dev; struct mlx4_dev *dev; @@ -585,6 +593,7 @@ struct mlx4_ib_dev { /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; struct list_head qp_list; + struct mlx4_ib_diag_counters diag_counters[MLX4_DIAG_COUNTERS_TYPES]; }; struct ib_event_work { diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 9c0e67bd2ba7..308a358e5b46 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -424,6 +424,83 @@ static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, item->key = be32_to_cpu(cqe->mkey); } +static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_wq *wq; + unsigned int cur; + unsigned int idx; + int np; + int i; + + wq = &qp->sq; + cur = wq->head - wq->tail; + np = *npolled; + + if (cur == 0) + return; + + for (i = 0; i < cur && np < num_entries; i++) { + idx = wq->last_poll & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[idx]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + np++; + wc->qp = &qp->ibqp; + wc++; + wq->last_poll = wq->w_list[idx].next; + } + *npolled = np; +} + +static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_wq *wq; + unsigned int cur; + int np; + int i; + + wq = &qp->rq; + cur = wq->head - 
wq->tail; + np = *npolled; + + if (cur == 0) + return; + + for (i = 0; i < cur && np < num_entries; i++) { + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + np++; + wc->qp = &qp->ibqp; + wc++; + } + *npolled = np; +} + +static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_qp *qp; + + *npolled = 0; + /* Find uncompleted WQEs belonging to that cq and retrun mmics ones */ + list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) { + sw_send_comp(qp, num_entries, wc + *npolled, npolled); + if (*npolled >= num_entries) + return; + } + + list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) { + sw_recv_comp(qp, num_entries, wc + *npolled, npolled); + if (*npolled >= num_entries) + return; + } +} + static int mlx5_poll_one(struct mlx5_ib_cq *cq, struct mlx5_ib_qp **cur_qp, struct ib_wc *wc) @@ -594,12 +671,18 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct mlx5_ib_cq *cq = to_mcq(ibcq); struct mlx5_ib_qp *cur_qp = NULL; + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_core_dev *mdev = dev->mdev; unsigned long flags; int soft_polled = 0; int npolled; int err = 0; spin_lock_irqsave(&cq->lock, flags); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + goto out; + } if (unlikely(!list_empty(&cq->wc_list))) soft_polled = poll_soft_wc(cq, num_entries, wc); @@ -612,7 +695,7 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) if (npolled) mlx5_cq_set_ci(&cq->mcq); - +out: spin_unlock_irqrestore(&cq->lock, flags); if (err == 0 || err == -EAGAIN) @@ -843,6 +926,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, cq->resize_buf = NULL; cq->resize_umem = NULL; cq->create_flags = attr->flags; + INIT_LIST_HEAD(&cq->list_send_qp); + 
INIT_LIST_HEAD(&cq->list_recv_qp); if (context) { err = create_cq_user(dev, udata, context, cq, entries, diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index 53e03c8ede79..79e6309460dc 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -69,15 +69,6 @@ static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev) return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn); } -static u32 next_outstanding(struct mlx5_ib_gsi_qp *gsi, u32 index) -{ - return ++index % gsi->cap.max_send_wr; -} - -#define for_each_outstanding_wr(gsi, index) \ - for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; \ - index = next_outstanding(gsi, index)) - /* Call with gsi->lock locked */ static void generate_completions(struct mlx5_ib_gsi_qp *gsi) { @@ -85,8 +76,9 @@ static void generate_completions(struct mlx5_ib_gsi_qp *gsi) struct mlx5_ib_gsi_wr *wr; u32 index; - for_each_outstanding_wr(gsi, index) { - wr = &gsi->outstanding_wrs[index]; + for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; + index++) { + wr = &gsi->outstanding_wrs[index % gsi->cap.max_send_wr]; if (!wr->completed) break; @@ -430,8 +422,9 @@ static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi, return -ENOMEM; } - gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi]; - gsi->outstanding_pi = next_outstanding(gsi, gsi->outstanding_pi); + gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi % + gsi->cap.max_send_wr]; + gsi->outstanding_pi++; if (!wc) { memset(&gsi_wr->wc, 0, sizeof(gsi_wr->wc)); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index dad63f038bb8..a84bb766fc62 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -42,11 +42,13 @@ #include <asm/pat.h> #endif #include <linux/sched.h> +#include <linux/delay.h> #include <rdma/ib_user_verbs.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include <linux/mlx5/port.h> #include <linux/mlx5/vport.h> 
+#include <linux/list.h> #include <rdma/ib_smi.h> #include <rdma/ib_umem.h> #include <linux/in.h> @@ -457,8 +459,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, int max_rq_sg; int max_sq_sg; u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz); + struct mlx5_ib_query_device_resp resp = {}; + size_t resp_len; + u64 max_tso; - if (uhw->inlen || uhw->outlen) + resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); + if (uhw->outlen && uhw->outlen < resp_len) + return -EINVAL; + else + resp.response_length = resp_len; + + if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) return -EINVAL; memset(props, 0, sizeof(*props)); @@ -511,10 +522,21 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_GEN(mdev, block_lb_mc)) props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; - if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && - (MLX5_CAP_ETH(dev->mdev, csum_cap))) + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) { + if (MLX5_CAP_ETH(mdev, csum_cap)) props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { + max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); + if (max_tso) { + resp.tso_caps.max_tso = 1 << max_tso; + resp.tso_caps.supported_qpts |= + 1 << IB_QPT_RAW_PACKET; + resp.response_length += sizeof(resp.tso_caps); + } + } + } + if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; props->device_cap_flags |= IB_DEVICE_UD_TSO; @@ -576,6 +598,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (!mlx5_core_is_pf(mdev)) props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; + if (uhw->outlen) { + err = ib_copy_to_udata(uhw, &resp, resp.response_length); + + if (err) + return err; + } + return 0; } @@ -983,6 +1012,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, goto out_uars; } + INIT_LIST_HEAD(&context->vma_private_list); INIT_LIST_HEAD(&context->db_page_list); 
mutex_init(&context->db_page_mutex); @@ -992,6 +1022,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (field_avail(typeof(resp), cqe_version, udata->outlen)) resp.response_length += sizeof(resp.cqe_version); + if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) { + resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE; + resp.response_length += sizeof(resp.cmds_supp_uhw); + } + /* * We don't want to expose information from the PCI bar that is located * after 4096 bytes, so if the arch only supports larger pages, let's @@ -1006,8 +1041,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE; resp.response_length += sizeof(resp.hca_core_clock_offset) + - sizeof(resp.reserved2) + - sizeof(resp.reserved3); + sizeof(resp.reserved2); } err = ib_copy_to_udata(udata, &resp, resp.response_length); @@ -1086,6 +1120,125 @@ static int get_index(unsigned long offset) return get_arg(offset); } +static void mlx5_ib_vma_open(struct vm_area_struct *area) +{ + /* vma_open is called when a new VMA is created on top of our VMA. This + * is done through either mremap flow or split_vma (usually due to + * mlock, madvise, munmap, etc.) We do not support a clone of the VMA, + * as this VMA is strongly hardware related. Therefore we set the + * vm_ops of the newly created/cloned VMA to NULL, to prevent it from + * calling us again and trying to do incorrect actions. We assume that + * the original VMA size is exactly a single page, and therefore all + * "splitting" operation will not happen to it. + */ + area->vm_ops = NULL; +} + +static void mlx5_ib_vma_close(struct vm_area_struct *area) +{ + struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data; + + /* It's guaranteed that all VMAs opened on a FD are closed before the + * file itself is closed, therefore no sync is needed with the regular + * closing flow. (e.g. 
mlx5 ib_dealloc_ucontext) + * However need a sync with accessing the vma as part of + * mlx5_ib_disassociate_ucontext. + * The close operation is usually called under mm->mmap_sem except when + * process is exiting. + * The exiting case is handled explicitly as part of + * mlx5_ib_disassociate_ucontext. + */ + mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data; + + /* setting the vma context pointer to null in the mlx5_ib driver's + * private data, to protect a race condition in + * mlx5_ib_disassociate_ucontext(). + */ + mlx5_ib_vma_priv_data->vma = NULL; + list_del(&mlx5_ib_vma_priv_data->list); + kfree(mlx5_ib_vma_priv_data); +} + +static const struct vm_operations_struct mlx5_ib_vm_ops = { + .open = mlx5_ib_vma_open, + .close = mlx5_ib_vma_close +}; + +static int mlx5_ib_set_vma_data(struct vm_area_struct *vma, + struct mlx5_ib_ucontext *ctx) +{ + struct mlx5_ib_vma_private_data *vma_prv; + struct list_head *vma_head = &ctx->vma_private_list; + + vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL); + if (!vma_prv) + return -ENOMEM; + + vma_prv->vma = vma; + vma->vm_private_data = vma_prv; + vma->vm_ops = &mlx5_ib_vm_ops; + + list_add(&vma_prv->list, vma_head); + + return 0; +} + +static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + int ret; + struct vm_area_struct *vma; + struct mlx5_ib_vma_private_data *vma_private, *n; + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); + if (!owning_process) + return; + + owning_mm = get_task_mm(owning_process); + if (!owning_mm) { + pr_info("no mm, disassociate ucontext is pending task termination\n"); + while (1) { + put_task_struct(owning_process); + usleep_range(1000, 2000); + owning_process = get_pid_task(ibcontext->tgid, + PIDTYPE_PID); + if (!owning_process || + owning_process->state == TASK_DEAD) { + 
pr_info("disassociate ucontext done, task was terminated\n"); + /* in case task was dead need to release the + * task struct. + */ + if (owning_process) + put_task_struct(owning_process); + return; + } + } + } + + /* need to protect from a race on closing the vma as part of + * mlx5_ib_vma_close. + */ + down_read(&owning_mm->mmap_sem); + list_for_each_entry_safe(vma_private, n, &context->vma_private_list, + list) { + vma = vma_private->vma; + ret = zap_vma_ptes(vma, vma->vm_start, + PAGE_SIZE); + WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__); + /* context going to be destroyed, should + * not access ops any more. + */ + vma->vm_ops = NULL; + list_del(&vma_private->list); + kfree(vma_private); + } + up_read(&owning_mm->mmap_sem); + mmput(owning_mm); + put_task_struct(owning_process); +} + static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) { switch (cmd) { @@ -1101,8 +1254,10 @@ static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) } static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, - struct vm_area_struct *vma, struct mlx5_uuar_info *uuari) + struct vm_area_struct *vma, + struct mlx5_ib_ucontext *context) { + struct mlx5_uuar_info *uuari = &context->uuari; int err; unsigned long idx; phys_addr_t pfn, pa; @@ -1152,14 +1307,13 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd), vma->vm_start, &pa); - return 0; + return mlx5_ib_set_vma_data(vma, context); } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); - struct mlx5_uuar_info *uuari = &context->uuari; unsigned long command; phys_addr_t pfn; @@ -1168,7 +1322,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_NC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: - return 
uar_mmap(dev, command, vma, uuari); + return uar_mmap(dev, command, vma, context); case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: return -ENOSYS; @@ -1331,6 +1485,32 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v, &ib_spec->ipv4.val.dst_ip, sizeof(ib_spec->ipv4.val.dst_ip)); break; + case IB_FLOW_SPEC_IPV6: + if (ib_spec->size != sizeof(ib_spec->ipv6)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + ethertype, ETH_P_IPV6); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.mask.src_ip, + sizeof(ib_spec->ipv6.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.val.src_ip, + sizeof(ib_spec->ipv6.val.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.mask.dst_ip, + sizeof(ib_spec->ipv6.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.val.dst_ip, + sizeof(ib_spec->ipv6.val.dst_ip)); + break; case IB_FLOW_SPEC_TCP: if (ib_spec->size != sizeof(ib_spec->tcp_udp)) return -EINVAL; @@ -1801,15 +1981,6 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); } -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); - return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev), - fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); -} - static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { @@ -1828,7 +1999,6 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr, } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, 
NULL); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); @@ -1836,7 +2006,6 @@ static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); static struct device_attribute *mlx5_class_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id, &dev_attr_fw_pages, @@ -1854,6 +2023,65 @@ static void pkey_change_handler(struct work_struct *work) mutex_unlock(&ports->devr->mutex); } +static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_qp *mqp; + struct mlx5_ib_cq *send_mcq, *recv_mcq; + struct mlx5_core_cq *mcq; + struct list_head cq_armed_list; + unsigned long flags_qp; + unsigned long flags_cq; + unsigned long flags; + + INIT_LIST_HEAD(&cq_armed_list); + + /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ + spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + 
list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + /*At that point all inflight post send were put to be executed as of we + * lock/unlock above locks Now need to arm all involved CQs. + */ + list_for_each_entry(mcq, &cq_armed_list, reset_notify) { + mcq->comp(mcq); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); +} + static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { @@ -1866,6 +2094,7 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, case MLX5_DEV_EVENT_SYS_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; + mlx5_ib_handle_internal_error(ibdev); break; case MLX5_DEV_EVENT_PORT_UP: @@ -2272,6 +2501,15 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +static void get_dev_fw_str(struct ib_device *ibdev, char *str, + size_t str_len) +{ + struct mlx5_ib_dev *dev = + container_of(ibdev, struct mlx5_ib_dev, ib_dev); + snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev), + fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); +} + static int mlx5_enable_roce(struct mlx5_ib_dev *dev) { int err; @@ -2298,6 +2536,113 @@ static void mlx5_disable_roce(struct mlx5_ib_dev *dev) unregister_netdevice_notifier(&dev->roce.nb); } +static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_ports; i++) + mlx5_core_dealloc_q_counter(dev->mdev, + dev->port[i].q_cnt_id); +} + +static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) +{ + int i; + int ret; + + for (i = 0; i < dev->num_ports; i++) { + ret = mlx5_core_alloc_q_counter(dev->mdev, + &dev->port[i].q_cnt_id); + if (ret) { + mlx5_ib_warn(dev, + "couldn't allocate queue counter for port %d, err %d\n", + i + 1, ret); + goto dealloc_counters; + } + } + + return 0; + 
+dealloc_counters: + while (--i >= 0) + mlx5_core_dealloc_q_counter(dev->mdev, + dev->port[i].q_cnt_id); + + return ret; +} + +static const char * const names[] = { + "rx_write_requests", + "rx_read_requests", + "rx_atomic_requests", + "out_of_buffer", + "out_of_sequence", + "duplicate_request", + "rnr_nak_retry_err", + "packet_seq_err", + "implied_nak_seq_err", + "local_ack_timeout_err", +}; + +static const size_t stats_offsets[] = { + MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests), + MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests), + MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests), + MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer), + MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence), + MLX5_BYTE_OFF(query_q_counter_out, duplicate_request), + MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err), + MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err), + MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err), + MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err), +}; + +static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets)); + + /* We support only per port stats */ + if (port_num == 0) + return NULL; + + return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); + void *out; + __be32 val; + int ret; + int i; + + if (!port || !stats) + return -ENOSYS; + + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + ret = mlx5_core_query_q_counter(dev->mdev, + dev->port[port - 1].q_cnt_id, 0, + out, outlen); + if (ret) + goto free; + + for (i = 0; i < ARRAY_SIZE(names); i++) { + val = *(__be32 *)(out + stats_offsets[i]); + stats->value[i] = (u64)be32_to_cpu(val); + } +free: + kvfree(out); 
+ return ARRAY_SIZE(names); +} + static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; @@ -2320,10 +2665,15 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->mdev = mdev; + dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), + GFP_KERNEL); + if (!dev->port) + goto err_dealloc; + rwlock_init(&dev->roce.netdev_lock); err = get_port_caps(dev); if (err) - goto err_dealloc; + goto err_free_port; if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); @@ -2418,6 +2768,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.map_mr_sg = mlx5_ib_map_mr_sg; dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; dev->ib_dev.get_port_immutable = mlx5_port_immutable; + dev->ib_dev.get_dev_fw_str = get_dev_fw_str; if (mlx5_core_is_pf(mdev)) { dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config; dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state; @@ -2425,6 +2776,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.set_vf_guid = mlx5_ib_set_vf_guid; } + dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext; + mlx5_ib_internal_fill_odp_caps(dev); if (MLX5_CAP_GEN(mdev, imaicl)) { @@ -2435,6 +2788,12 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); } + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && + MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { + dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats; + dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats; + } + if (MLX5_CAP_GEN(mdev, xrc)) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; @@ -2447,9 +2806,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) IB_LINK_LAYER_ETHERNET) { dev->ib_dev.create_flow = mlx5_ib_create_flow; dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; + dev->ib_dev.create_wq = mlx5_ib_create_wq; + dev->ib_dev.modify_wq = mlx5_ib_modify_wq; + dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; + 
dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table; + dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table; dev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | - (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); } err = init_node_data(dev); if (err) @@ -2457,6 +2826,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) mutex_init(&dev->flow_db.lock); mutex_init(&dev->cap_mask_mutex); + INIT_LIST_HEAD(&dev->qp_list); + spin_lock_init(&dev->reset_flow_resource_lock); if (ll == IB_LINK_LAYER_ETHERNET) { err = mlx5_enable_roce(dev); @@ -2472,10 +2843,14 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (err) goto err_rsrc; - err = ib_register_device(&dev->ib_dev, NULL); + err = mlx5_ib_alloc_q_counters(dev); if (err) goto err_odp; + err = ib_register_device(&dev->ib_dev, NULL); + if (err) + goto err_q_cnt; + err = create_umr_res(dev); if (err) goto err_dev; @@ -2497,6 +2872,9 @@ err_umrc: err_dev: ib_unregister_device(&dev->ib_dev); +err_q_cnt: + mlx5_ib_dealloc_q_counters(dev); + err_odp: mlx5_ib_odp_remove_one(dev); @@ -2507,6 +2885,9 @@ err_disable_roce: if (ll == IB_LINK_LAYER_ETHERNET) mlx5_disable_roce(dev); +err_free_port: + kfree(dev->port); + err_dealloc: ib_dealloc_device((struct ib_device *)dev); @@ -2519,11 +2900,13 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); ib_unregister_device(&dev->ib_dev); + mlx5_ib_dealloc_q_counters(dev); destroy_umrc_res(dev); mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); if (ll == IB_LINK_LAYER_ETHERNET) mlx5_disable_roce(dev); + kfree(dev->port); 
ib_dealloc_device(&dev->ib_dev); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index c4a9825828bc..372385d0f993 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -105,6 +105,11 @@ enum { MLX5_CQE_VERSION_V1, }; +struct mlx5_ib_vma_private_data { + struct list_head list; + struct vm_area_struct *vma; +}; + struct mlx5_ib_ucontext { struct ib_ucontext ibucontext; struct list_head db_page_list; @@ -116,6 +121,7 @@ struct mlx5_ib_ucontext { u8 cqe_version; /* Transport Domain number */ u32 tdn; + struct list_head vma_private_list; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) @@ -217,12 +223,41 @@ struct mlx5_ib_wq { void *qend; }; +struct mlx5_ib_rwq { + struct ib_wq ibwq; + u32 rqn; + u32 rq_num_pas; + u32 log_rq_stride; + u32 log_rq_size; + u32 rq_page_offset; + u32 log_page_size; + struct ib_umem *umem; + size_t buf_size; + unsigned int page_shift; + int create_type; + struct mlx5_db db; + u32 user_index; + u32 wqe_count; + u32 wqe_shift; + int wq_sig; +}; + enum { MLX5_QP_USER, MLX5_QP_KERNEL, MLX5_QP_EMPTY }; +enum { + MLX5_WQ_USER, + MLX5_WQ_KERNEL +}; + +struct mlx5_ib_rwq_ind_table { + struct ib_rwq_ind_table ib_rwq_ind_tbl; + u32 rqtn; +}; + /* * Connect-IB can trigger up to four concurrent pagefaults * per-QP. 
@@ -266,6 +301,10 @@ struct mlx5_ib_qp_trans { u8 resp_depth; }; +struct mlx5_ib_rss_qp { + u32 tirn; +}; + struct mlx5_ib_rq { struct mlx5_ib_qp_base base; struct mlx5_ib_wq *rq; @@ -294,6 +333,7 @@ struct mlx5_ib_qp { union { struct mlx5_ib_qp_trans trans_qp; struct mlx5_ib_raw_packet_qp raw_packet_qp; + struct mlx5_ib_rss_qp rss_qp; }; struct mlx5_buf buf; @@ -340,6 +380,9 @@ struct mlx5_ib_qp { spinlock_t disable_page_faults_lock; struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; #endif + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; }; struct mlx5_ib_cq_buf { @@ -401,6 +444,8 @@ struct mlx5_ib_cq { struct mlx5_ib_cq_buf *resize_buf; struct ib_umem *resize_umem; int cqe_size; + struct list_head list_send_qp; + struct list_head list_recv_qp; u32 create_flags; struct list_head wc_list; enum ib_cq_notify_flags notify_flags; @@ -546,6 +591,10 @@ struct mlx5_ib_resources { struct mutex mutex; }; +struct mlx5_ib_port { + u16 q_cnt_id; +}; + struct mlx5_roce { /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL * netdev pointer @@ -581,6 +630,11 @@ struct mlx5_ib_dev { struct srcu_struct mr_srcu; #endif struct mlx5_ib_flow_db flow_db; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; + /* Array with num_ports elements */ + struct mlx5_ib_port *port; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -628,6 +682,16 @@ static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) return container_of(ibqp, struct mlx5_ib_qp, ibqp); } +static inline struct mlx5_ib_rwq *to_mrwq(struct ib_wq *ibwq) +{ + return container_of(ibwq, struct mlx5_ib_rwq, ibwq); +} + +static inline struct mlx5_ib_rwq_ind_table *to_mrwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + return container_of(ib_rwq_ind_tbl, struct mlx5_ib_rwq_ind_table, ib_rwq_ind_tbl); +} + static inline struct mlx5_ib_srq *to_mibsrq(struct 
mlx5_core_srq *msrq) { return container_of(msrq, struct mlx5_ib_srq, msrq); @@ -762,6 +826,16 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_wq(struct ib_wq *wq); +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); +struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING extern struct workqueue_struct *mlx5_ib_page_fault_wq; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 8cf2ce50511f..4b021305c321 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1193,12 +1193,16 @@ error: static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { + struct mlx5_core_dev *mdev = dev->mdev; struct umr_common *umrc = &dev->umrc; struct mlx5_ib_umr_context umr_context; struct mlx5_umr_wr umrwr = {}; struct ib_send_wr *bad; int err; + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + return 0; + mlx5_ib_init_umr_context(&umr_context); umrwr.wr.wr_cqe = &umr_context.cqe; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ce0a7ab35a22..0dd7d93cac95 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -77,6 +77,10 @@ struct mlx5_wqe_eth_pad { u8 rsvd0[16]; }; +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq); + static int is_qp0(enum 
ib_qp_type qp_type) { return qp_type == IB_QPT_SMI; @@ -609,6 +613,11 @@ static int to_mlx5_st(enum ib_qp_type type) } } +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq); +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq); + static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) { return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; @@ -649,6 +658,71 @@ err_umem: return err; } +static void destroy_user_rq(struct ib_pd *pd, struct mlx5_ib_rwq *rwq) +{ + struct mlx5_ib_ucontext *context; + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &rwq->db); + if (rwq->umem) + ib_umem_release(rwq->umem); +} + +static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_rwq *rwq, + struct mlx5_ib_create_wq *ucmd) +{ + struct mlx5_ib_ucontext *context; + int page_shift = 0; + int npages; + u32 offset = 0; + int ncont = 0; + int err; + + if (!ucmd->buf_addr) + return -EINVAL; + + context = to_mucontext(pd->uobject->context); + rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr, + rwq->buf_size, 0, 0); + if (IS_ERR(rwq->umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + err = PTR_ERR(rwq->umem); + return err; + } + + mlx5_ib_cont_pages(rwq->umem, ucmd->buf_addr, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift, + &rwq->rq_page_offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + rwq->rq_num_pas = ncont; + rwq->page_shift = page_shift; + rwq->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE); + + mlx5_ib_dbg(dev, "addr 0x%llx, size %zd, npages %d, page_shift %d, ncont %d, offset %d\n", + (unsigned long long)ucmd->buf_addr, rwq->buf_size, + npages, page_shift, ncont, offset); + + err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db); + if (err) { + mlx5_ib_dbg(dev, "map 
failed\n"); + goto err_umem; + } + + rwq->create_type = MLX5_WQ_USER; + return 0; + +err_umem: + ib_umem_release(rwq->umem); + return err; +} + static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_udata *udata, struct ib_qp_init_attr *attr, @@ -1201,6 +1275,187 @@ static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp, rq->doorbell = &qp->db; } +static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn); +} + +static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + struct mlx5_ib_create_qp_resp resp = {}; + int inlen; + int err; + u32 *in; + void *tirc; + void *hfso; + u32 selected_fields = 0; + size_t min_resp_len; + u32 tdn = mucontext->tdn; + struct mlx5_ib_create_qp_rss ucmd = {}; + size_t required_cmd_sz; + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) + return -EOPNOTSUPP; + + if (init_attr->create_flags || init_attr->send_cq) + return -EINVAL; + + min_resp_len = offsetof(typeof(resp), uuar_index) + sizeof(resp.uuar_index); + if (udata->outlen < min_resp_len) + return -EINVAL; + + required_cmd_sz = offsetof(typeof(ucmd), reserved1) + sizeof(ucmd.reserved1); + if (udata->inlen < required_cmd_sz) { + mlx5_ib_dbg(dev, "invalid inlen\n"); + return -EINVAL; + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + mlx5_ib_dbg(dev, "inlen is not supported\n"); + return -EOPNOTSUPP; + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + if (ucmd.comp_mask) { + mlx5_ib_dbg(dev, "invalid comp mask\n"); + return -EOPNOTSUPP; + } 
+ + if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved)) || ucmd.reserved1) { + mlx5_ib_dbg(dev, "invalid reserved\n"); + return -EOPNOTSUPP; + } + + err = ib_copy_to_udata(udata, &resp, min_resp_len); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EINVAL; + } + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, disp_type, + MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, + init_attr->rwq_ind_tbl->ind_tbl_num); + MLX5_SET(tirc, tirc, transport_domain, tdn); + + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + switch (ucmd.rx_hash_function) { + case MLX5_RX_HASH_FUNC_TOEPLITZ: + { + void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); + size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key); + + if (len != ucmd.rx_key_len) { + err = -EINVAL; + goto err; + } + + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ); + MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); + memcpy(rss_key, ucmd.rx_hash_key, len); + break; + } + default: + err = -EOPNOTSUPP; + goto err; + } + + if (!ucmd.rx_hash_fields_mask) { + /* special case when this TIR serves as steering entry without hashing */ + if (!init_attr->rwq_ind_tbl->log_ind_tbl_size) + goto create_tir; + err = -EINVAL; + goto err; + } + + if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) && + ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) { + err = -EINVAL; + goto err; + } + + /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */ + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + 
(ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + + if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) && + ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))) { + err = -EINVAL; + goto err; + } + + /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */ + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_TCP); + else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_UDP); + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT; + + MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); + +create_tir: + err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn); + + if (err) + goto err; + + kvfree(in); + /* qpn is reserved for that QP */ + qp->trans_qp.base.mqp.qpn = 0; + return 0; + +err: + kvfree(in); + return err; +} + static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr 
*init_attr, struct ib_udata *udata, struct mlx5_ib_qp *qp) @@ -1211,6 +1466,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_create_qp_resp resp; struct mlx5_create_qp_mbox_in *in; struct mlx5_ib_create_qp ucmd; + struct mlx5_ib_cq *send_cq; + struct mlx5_ib_cq *recv_cq; + unsigned long flags; int inlen = sizeof(*in); int err; u32 uidx = MLX5_IB_DEFAULT_UIDX; @@ -1227,6 +1485,14 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); + if (init_attr->rwq_ind_tbl) { + if (!udata) + return -ENOSYS; + + err = create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata); + return err; + } + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { if (!MLX5_CAP_GEN(mdev, block_lb_mc)) { mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); @@ -1460,6 +1726,23 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, base->container_mibqp = qp; base->mqp.event = mlx5_ib_qp_event; + get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq, + &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* Maintain device to QPs access, needed for further handling via reset + * flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling via reset flow + */ + if (send_cq) + list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + if (recv_cq) + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + return 0; err_create: @@ -1478,23 +1761,23 @@ static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv if (send_cq) { if (recv_cq) { if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); spin_lock_nested(&recv_cq->lock, 
SINGLE_DEPTH_NESTING); } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } else { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } } else if (recv_cq) { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); __acquire(&send_cq->lock); } else { __acquire(&send_cq->lock); @@ -1509,21 +1792,21 @@ static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *re if (recv_cq) { if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { __release(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else { spin_unlock(&send_cq->lock); - spin_unlock_irq(&recv_cq->lock); + spin_unlock(&recv_cq->lock); } } else { __release(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } } else if (recv_cq) { __release(&send_cq->lock); - spin_unlock_irq(&recv_cq->lock); + spin_unlock(&recv_cq->lock); } else { __release(&recv_cq->lock); __release(&send_cq->lock); @@ -1535,17 +1818,18 @@ static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp) return to_mpd(qp->ibqp.pd); } -static void get_cqs(struct mlx5_ib_qp *qp, +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) { - switch (qp->ibqp.qp_type) { + switch (qp_type) { case IB_QPT_XRC_TGT: *send_cq = NULL; *recv_cq = NULL; break; case MLX5_IB_QPT_REG_UMR: case IB_QPT_XRC_INI: - *send_cq = to_mcq(qp->ibqp.send_cq); + *send_cq = ib_send_cq ? 
to_mcq(ib_send_cq) : NULL; *recv_cq = NULL; break; @@ -1557,8 +1841,8 @@ static void get_cqs(struct mlx5_ib_qp *qp, case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: case IB_QPT_RAW_PACKET: - *send_cq = to_mcq(qp->ibqp.send_cq); - *recv_cq = to_mcq(qp->ibqp.recv_cq); + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL; break; case IB_QPT_MAX: @@ -1577,8 +1861,14 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) struct mlx5_ib_cq *send_cq, *recv_cq; struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_modify_qp_mbox_in *in; + unsigned long flags; int err; + if (qp->ibqp.rwq_ind_tbl) { + destroy_rss_raw_qp_tir(dev, qp); + return; + } + base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ? &qp->raw_packet_qp.rq.base : &qp->trans_qp.base; @@ -1602,17 +1892,28 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) base->mqp.qpn); } - get_cqs(qp, &send_cq, &recv_cq); + get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, + &send_cq, &recv_cq); + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + if (send_cq) + list_del(&qp->cq_send_list); + + if (recv_cq) + list_del(&qp->cq_recv_list); if (qp->create_type == MLX5_QP_KERNEL) { - mlx5_ib_lock_cqs(send_cq, recv_cq); __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, qp->ibqp.srq ? 
to_msrq(qp->ibqp.srq) : NULL); if (send_cq != recv_cq) __mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL); - mlx5_ib_unlock_cqs(send_cq, recv_cq); } + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { destroy_raw_packet_qp(dev, qp); @@ -2300,7 +2601,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } pd = get_pd(qp); - get_cqs(qp, &send_cq, &recv_cq); + get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, + &send_cq, &recv_cq); context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; @@ -2349,6 +2651,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, else sqd_event = 0; + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num : + qp->port) - 1; + struct mlx5_ib_port *mibport = &dev->port[port_num]; + + context->qp_counter_set_usr_page |= + cpu_to_be32((u32)(mibport->q_cnt_id) << 24); + } + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->sq_crq_size |= cpu_to_be16(1 << 4); @@ -2439,6 +2750,9 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int port; enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; + if (ibqp->rwq_ind_tbl) + return -ENOSYS; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); @@ -3397,6 +3711,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, { struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_qp *qp; struct mlx5_ib_mr *mr; struct mlx5_wqe_data_seg *dpseg; @@ -3424,6 +3739,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, spin_lock_irqsave(&qp->sq.lock, flags); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; 
+ *bad_wr = wr; + nreq = 0; + goto out; + } + for (nreq = 0; wr; nreq++, wr = wr->next) { if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { mlx5_ib_warn(dev, "\n"); @@ -3725,6 +4047,8 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_wqe_data_seg *scat; struct mlx5_rwqe_sig *sig; + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; unsigned long flags; int err = 0; int nreq; @@ -3736,6 +4060,13 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, spin_lock_irqsave(&qp->rq.lock, flags); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; nreq++, wr = wr->next) { @@ -4055,6 +4386,9 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int err = 0; u8 raw_packet_qp_state; + if (ibqp->rwq_ind_tbl) + return -ENOSYS; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, qp_init_attr); @@ -4164,3 +4498,322 @@ int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) return 0; } + +static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, + struct ib_wq_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev; + __be64 *rq_pas0; + void *in; + void *rqc; + void *wq; + int inlen; + int err; + + dev = to_mdev(pd->device); + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rwq->rq_num_pas; + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + MLX5_SET(rqc, rqc, mem_rq_type, + MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, user_index, rwq->user_index); + MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, 
wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); + MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride); + MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size); + MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn); + MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset); + MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size); + MLX5_SET(wq, wq, wq_signature, rwq->wq_sig); + MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma); + rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); + err = mlx5_core_create_rq(dev->mdev, in, inlen, &rwq->rqn); + kvfree(in); + return err; +} + +static int set_user_rq_size(struct mlx5_ib_dev *dev, + struct ib_wq_init_attr *wq_init_attr, + struct mlx5_ib_create_wq *ucmd, + struct mlx5_ib_rwq *rwq) +{ + /* Sanity check RQ size before proceeding */ + if (wq_init_attr->max_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_wq_sz))) + return -EINVAL; + + if (!ucmd->rq_wqe_count) + return -EINVAL; + + rwq->wqe_count = ucmd->rq_wqe_count; + rwq->wqe_shift = ucmd->rq_wqe_shift; + rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift); + rwq->log_rq_stride = rwq->wqe_shift; + rwq->log_rq_size = ilog2(rwq->wqe_count); + return 0; +} + +static int prepare_user_rq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata, + struct mlx5_ib_rwq *rwq) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_wq ucmd = {}; + int err; + size_t required_cmd_sz; + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) { + mlx5_ib_dbg(dev, "invalid inlen\n"); + return -EINVAL; + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + mlx5_ib_dbg(dev, "inlen is not supported\n"); + return -EOPNOTSUPP; + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + if (ucmd.comp_mask) { + 
mlx5_ib_dbg(dev, "invalid comp mask\n"); + return -EOPNOTSUPP; + } + + if (ucmd.reserved) { + mlx5_ib_dbg(dev, "invalid reserved\n"); + return -EOPNOTSUPP; + } + + err = set_user_rq_size(dev, init_attr, &ucmd, rwq); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + err = create_user_rq(dev, pd, rwq, &ucmd); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + if (err) + return err; + } + + rwq->user_index = ucmd.user_index; + return 0; +} + +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_rwq *rwq; + struct mlx5_ib_create_wq_resp resp = {}; + size_t min_resp_len; + int err; + + if (!udata) + return ERR_PTR(-ENOSYS); + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + dev = to_mdev(pd->device); + switch (init_attr->wq_type) { + case IB_WQT_RQ: + rwq = kzalloc(sizeof(*rwq), GFP_KERNEL); + if (!rwq) + return ERR_PTR(-ENOMEM); + err = prepare_user_rq(pd, init_attr, udata, rwq); + if (err) + goto err; + err = create_rq(rwq, pd, init_attr); + if (err) + goto err_user_rq; + break; + default: + mlx5_ib_dbg(dev, "unsupported wq type %d\n", + init_attr->wq_type); + return ERR_PTR(-EINVAL); + } + + rwq->ibwq.wq_num = rwq->rqn; + rwq->ibwq.state = IB_WQS_RESET; + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err_copy; + } + + return &rwq->ibwq; + +err_copy: + mlx5_core_destroy_rq(dev->mdev, rwq->rqn); +err_user_rq: + destroy_user_rq(pd, rwq); +err: + kfree(rwq); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_wq(struct ib_wq *wq) +{ + struct mlx5_ib_dev *dev = to_mdev(wq->device); + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + + mlx5_core_destroy_rq(dev->mdev, rwq->rqn); + 
destroy_user_rq(wq->pd, rwq); + kfree(rwq); + + return 0; +} + +struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl; + int sz = 1 << init_attr->log_ind_tbl_size; + struct mlx5_ib_create_rwq_ind_tbl_resp resp = {}; + size_t min_resp_len; + int inlen; + int err; + int i; + u32 *in; + void *rqtc; + + if (udata->inlen > 0 && + !ib_is_udata_cleared(udata, 0, + udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + rwq_ind_tbl = kzalloc(sizeof(*rwq_ind_tbl), GFP_KERNEL); + if (!rwq_ind_tbl) + return ERR_PTR(-ENOMEM); + + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err; + } + + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); + MLX5_SET(rqtc, rqtc, rqt_max_size, sz); + + for (i = 0; i < sz; i++) + MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num); + + err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn); + kvfree(in); + + if (err) + goto err; + + rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn; + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err_copy; + } + + return &rwq_ind_tbl->ib_rwq_ind_tbl; + +err_copy: + mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); +err: + kfree(rwq_ind_tbl); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl); + struct mlx5_ib_dev *dev = 
to_mdev(ib_rwq_ind_tbl->device); + + mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); + + kfree(rwq_ind_tbl); + return 0; +} + +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(wq->device); + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + struct mlx5_ib_modify_wq ucmd = {}; + size_t required_cmd_sz; + int curr_wq_state; + int wq_state; + int inlen; + int err; + void *rqc; + void *in; + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) + return -EFAULT; + + if (ucmd.comp_mask || ucmd.reserved) + return -EOPNOTSUPP; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + curr_wq_state = (wq_attr_mask & IB_WQ_CUR_STATE) ? + wq_attr->curr_wq_state : wq->state; + wq_state = (wq_attr_mask & IB_WQ_STATE) ? + wq_attr->wq_state : curr_wq_state; + if (curr_wq_state == IB_WQS_ERR) + curr_wq_state = MLX5_RQC_STATE_ERR; + if (wq_state == IB_WQS_ERR) + wq_state = MLX5_RQC_STATE_ERR; + MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state); + MLX5_SET(rqc, rqc, state, wq_state); + + err = mlx5_core_modify_rq(dev->mdev, rwq->rqn, in, inlen); + kvfree(in); + if (!err) + rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? 
IB_WQS_ERR : wq_state; + + return err; +} diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 3b2ddd64a371..ed6ac52355f1 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -74,14 +74,12 @@ static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) } static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, - struct mlx5_create_srq_mbox_in **in, - struct ib_udata *udata, int buf_size, int *inlen, - int is_xrc) + struct mlx5_srq_attr *in, + struct ib_udata *udata, int buf_size) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_create_srq ucmd = {}; size_t ucmdlen; - void *xsrqc; int err; int npages; int page_shift; @@ -104,7 +102,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, udata->inlen - sizeof(ucmd))) return -EINVAL; - if (is_xrc) { + if (in->type == IB_SRQT_XRC) { err = get_srq_user_index(to_mucontext(pd->uobject->context), &ucmd, udata->inlen, &uidx); if (err) @@ -130,14 +128,13 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, goto err_umem; } - *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; - *in = mlx5_vzalloc(*inlen); - if (!(*in)) { + in->pas = mlx5_vzalloc(sizeof(*in->pas) * ncont); + if (!in->pas) { err = -ENOMEM; goto err_umem; } - mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0); + mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0); err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), ucmd.db_addr, &srq->db); @@ -146,20 +143,16 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, goto err_in; } - (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; - (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); - - if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) && - is_xrc){ - xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, - xrc_srq_context_entry); - MLX5_SET(xrc_srqc, xsrqc, user_index, uidx); - } + 
in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->page_offset = offset; + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && + in->type == IB_SRQT_XRC) + in->user_index = uidx; return 0; err_in: - kvfree(*in); + kvfree(in->pas); err_umem: ib_umem_release(srq->umem); @@ -168,15 +161,13 @@ err_umem: } static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, - struct mlx5_create_srq_mbox_in **in, int buf_size, - int *inlen, int is_xrc) + struct mlx5_srq_attr *in, int buf_size) { int err; int i; struct mlx5_wqe_srq_next_seg *next; int page_shift; int npages; - void *xsrqc; err = mlx5_db_alloc(dev->mdev, &srq->db); if (err) { @@ -204,13 +195,12 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT)); mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n", buf_size, page_shift, srq->buf.npages, npages); - *inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages; - *in = mlx5_vzalloc(*inlen); - if (!*in) { + in->pas = mlx5_vzalloc(sizeof(*in->pas) * npages); + if (!in->pas) { err = -ENOMEM; goto err_buf; } - mlx5_fill_page_array(&srq->buf, (*in)->pas); + mlx5_fill_page_array(&srq->buf, in->pas); srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL); if (!srq->wrid) { @@ -221,20 +211,15 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, } srq->wq_sig = !!srq_signature; - (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; - - if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) && - is_xrc){ - xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, - xrc_srq_context_entry); - /* 0xffffff means we ask to work with cqe version 0 */ - MLX5_SET(xrc_srqc, xsrqc, user_index, MLX5_IB_DEFAULT_UIDX); - } + in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && + in->type == IB_SRQT_XRC) + 
in->user_index = MLX5_IB_DEFAULT_UIDX; return 0; err_in: - kvfree(*in); + kvfree(in->pas); err_buf: mlx5_buf_free(dev->mdev, &srq->buf); @@ -267,10 +252,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, int desc_size; int buf_size; int err; - struct mlx5_create_srq_mbox_in *uninitialized_var(in); - int uninitialized_var(inlen); - int is_xrc; - u32 flgs, xrcdn; + struct mlx5_srq_attr in = {0}; __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); /* Sanity check SRQ size before proceeding */ @@ -302,14 +284,10 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, srq->msrq.max_avail_gather); - is_xrc = (init_attr->srq_type == IB_SRQT_XRC); - if (pd->uobject) - err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen, - is_xrc); + err = create_srq_user(pd, srq, &in, udata, buf_size); else - err = create_srq_kernel(dev, srq, &in, buf_size, &inlen, - is_xrc); + err = create_srq_kernel(dev, srq, &in, buf_size); if (err) { mlx5_ib_warn(dev, "create srq %s failed, err %d\n", @@ -317,23 +295,23 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, goto err_srq; } - in->ctx.state_log_sz = ilog2(srq->msrq.max); - flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24; - xrcdn = 0; - if (is_xrc) { - xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; - in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn); + in.type = init_attr->srq_type; + in.log_size = ilog2(srq->msrq.max); + in.wqe_shift = srq->msrq.wqe_shift - 4; + if (srq->wq_sig) + in.flags |= MLX5_SRQ_FLAG_WQ_SIG; + if (init_attr->srq_type == IB_SRQT_XRC) { + in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; + in.cqn = to_mcq(init_attr->ext.xrc.cq)->mcq.cqn; } else if (init_attr->srq_type == IB_SRQT_BASIC) { - xrcdn = to_mxrcd(dev->devr.x0)->xrcdn; - in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn); + in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn; + in.cqn = 
to_mcq(dev->devr.c0)->mcq.cqn; } - in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF)); - - in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn); - in->ctx.db_record = cpu_to_be64(srq->db.dma); - err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen, is_xrc); - kvfree(in); + in.pd = to_mpd(pd)->pdn; + in.db_record = srq->db.dma; + err = mlx5_core_create_srq(dev->mdev, &srq->msrq, &in); + kvfree(in.pas); if (err) { mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err); goto err_usr_kern_srq; @@ -401,7 +379,7 @@ int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); struct mlx5_ib_srq *srq = to_msrq(ibsrq); int ret; - struct mlx5_query_srq_mbox_out *out; + struct mlx5_srq_attr *out; out = kzalloc(sizeof(*out), GFP_KERNEL); if (!out) @@ -411,7 +389,7 @@ int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) if (ret) goto out_box; - srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm); + srq_attr->srq_limit = out->lwm; srq_attr->max_wr = srq->msrq.max - 1; srq_attr->max_sge = srq->msrq.max_gs; @@ -458,6 +436,8 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, struct mlx5_ib_srq *srq = to_msrq(ibsrq); struct mlx5_wqe_srq_next_seg *next; struct mlx5_wqe_data_seg *scat; + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_core_dev *mdev = dev->mdev; unsigned long flags; int err = 0; int nreq; @@ -465,6 +445,12 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, spin_lock_irqsave(&srq->lock, flags); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + goto out; + } + for (nreq = 0; wr; nreq++, wr = wr->next) { if (unlikely(wr->num_sge > srq->msrq.max_gs)) { err = -EINVAL; @@ -507,7 +493,7 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, *srq->db.db = cpu_to_be32(srq->wqe_ctr); } - +out: spin_unlock_irqrestore(&srq->lock, flags); return err; diff --git 
a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index 61bc308bb802..188dac4301b5 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -46,6 +46,10 @@ enum { MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, }; +enum { + MLX5_WQ_FLAG_SIGNATURE = 1 << 0, +}; + /* Increment this value if any changes that break userspace ABI * compatibility are made. @@ -79,6 +83,10 @@ enum mlx5_ib_alloc_ucontext_resp_mask { MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, }; +enum mlx5_user_cmds_supp_uhw { + MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE = 1 << 0, +}; + struct mlx5_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 bf_reg_size; @@ -94,8 +102,8 @@ struct mlx5_ib_alloc_ucontext_resp { __u32 comp_mask; __u32 response_length; __u8 cqe_version; - __u8 reserved2; - __u16 reserved3; + __u8 cmds_supp_uhw; + __u16 reserved2; __u64 hca_core_clock_offset; }; @@ -103,6 +111,22 @@ struct mlx5_ib_alloc_pd_resp { __u32 pdn; }; +struct mlx5_ib_tso_caps { + __u32 max_tso; /* Maximum tso payload size in bytes */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_UD + */ + __u32 supported_qpts; +}; + +struct mlx5_ib_query_device_resp { + __u32 comp_mask; + __u32 response_length; + struct mlx5_ib_tso_caps tso_caps; +}; + struct mlx5_ib_create_cq { __u64 buf_addr; __u64 db_addr; @@ -148,6 +172,40 @@ struct mlx5_ib_create_qp { __u64 sq_buf_addr; }; +/* RX Hash function flags */ +enum mlx5_rx_hash_function_flags { + MLX5_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +/* + * RX Hash flags, these flags allows to set which incoming packet's field should + * participates in RX Hash. Each flag represent certain packet's field, + * when the flag is set the field that is represented by the flag will + * participate in RX Hash calculation. + * Note: *IPV4 and *IPV6 flags can't be enabled together on the same QP + * and *TCP and *UDP flags can't be enabled together on the same QP. 
+*/ +enum mlx5_rx_hash_fields { + MLX5_RX_HASH_SRC_IPV4 = 1 << 0, + MLX5_RX_HASH_DST_IPV4 = 1 << 1, + MLX5_RX_HASH_SRC_IPV6 = 1 << 2, + MLX5_RX_HASH_DST_IPV6 = 1 << 3, + MLX5_RX_HASH_SRC_PORT_TCP = 1 << 4, + MLX5_RX_HASH_DST_PORT_TCP = 1 << 5, + MLX5_RX_HASH_SRC_PORT_UDP = 1 << 6, + MLX5_RX_HASH_DST_PORT_UDP = 1 << 7 +}; + +struct mlx5_ib_create_qp_rss { + __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ + __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */ + __u8 rx_key_len; /* valid only for Toeplitz */ + __u8 reserved[6]; + __u8 rx_hash_key[128]; /* valid only for Toeplitz */ + __u32 comp_mask; + __u32 reserved1; +}; + struct mlx5_ib_create_qp_resp { __u32 uuar_index; }; @@ -159,6 +217,32 @@ struct mlx5_ib_alloc_mw { __u16 reserved2; }; +struct mlx5_ib_create_wq { + __u64 buf_addr; + __u64 db_addr; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 user_index; + __u32 flags; + __u32 comp_mask; + __u32 reserved; +}; + +struct mlx5_ib_create_wq_resp { + __u32 response_length; + __u32 reserved; +}; + +struct mlx5_ib_create_rwq_ind_tbl_resp { + __u32 response_length; + __u32 reserved; +}; + +struct mlx5_ib_modify_wq { + __u32 comp_mask; + __u32 reserved; +}; + static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, struct mlx5_ib_create_qp *ucmd, int inlen, diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 9866c35cc977..da2335f7f7c3 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1081,16 +1081,6 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr, return sprintf(buf, "%x\n", dev->rev_id); } -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct mthca_dev *dev = - container_of(device, struct mthca_dev, ib_dev.dev); - return sprintf(buf, "%d.%d.%d\n", (int) (dev->fw_ver >> 32), - (int) (dev->fw_ver >> 16) & 0xffff, - (int) 
dev->fw_ver & 0xffff); -} - static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { @@ -1120,13 +1110,11 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr, } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static struct device_attribute *mthca_dev_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id }; @@ -1187,6 +1175,17 @@ static int mthca_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +static void get_dev_fw_str(struct ib_device *device, char *str, + size_t str_len) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev); + snprintf(str, str_len, "%d.%d.%d", + (int) (dev->fw_ver >> 32), + (int) (dev->fw_ver >> 16) & 0xffff, + (int) dev->fw_ver & 0xffff); +} + int mthca_register_device(struct mthca_dev *dev) { int ret; @@ -1266,6 +1265,7 @@ int mthca_register_device(struct mthca_dev *dev) dev->ib_dev.reg_user_mr = mthca_reg_user_mr; dev->ib_dev.dereg_mr = mthca_dereg_mr; dev->ib_dev.get_port_immutable = mthca_port_immutable; + dev->ib_dev.get_dev_fw_str = get_dev_fw_str; if (dev->mthca_flags & MTHCA_FLAG_FMR) { dev->ib_dev.alloc_fmr = mthca_alloc_fmr; diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c index 74c6a9426047..6727af27c017 100644 --- a/drivers/infiniband/hw/mthca/mthca_reset.c +++ b/drivers/infiniband/hw/mthca/mthca_reset.c @@ -98,7 +98,7 @@ int mthca_reset(struct mthca_dev *mdev) err = -ENOMEM; mthca_err(mdev, "Couldn't allocate memory to save HCA " "PCI header, aborting.\n"); - goto out; + goto put_dev; } for (i = 0; i < 64; ++i) { @@ -108,7 +108,7 @@ int mthca_reset(struct mthca_dev *mdev) err = -ENODEV; mthca_err(mdev, "Couldn't save HCA " "PCI header, aborting.\n"); - goto out; + goto 
free_hca; } } @@ -121,7 +121,7 @@ int mthca_reset(struct mthca_dev *mdev) err = -ENOMEM; mthca_err(mdev, "Couldn't allocate memory to save HCA " "bridge PCI header, aborting.\n"); - goto out; + goto free_hca; } for (i = 0; i < 64; ++i) { @@ -131,7 +131,7 @@ int mthca_reset(struct mthca_dev *mdev) err = -ENODEV; mthca_err(mdev, "Couldn't save HCA bridge " "PCI header, aborting.\n"); - goto out; + goto free_bh; } } bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX); @@ -139,7 +139,7 @@ int mthca_reset(struct mthca_dev *mdev) err = -ENODEV; mthca_err(mdev, "Couldn't locate HCA bridge " "PCI-X capability, aborting.\n"); - goto out; + goto free_bh; } } @@ -152,7 +152,7 @@ int mthca_reset(struct mthca_dev *mdev) err = -ENOMEM; mthca_err(mdev, "Couldn't map HCA reset register, " "aborting.\n"); - goto out; + goto free_bh; } writel(MTHCA_RESET_VALUE, reset); @@ -172,7 +172,7 @@ int mthca_reset(struct mthca_dev *mdev) err = -ENODEV; mthca_err(mdev, "Couldn't access HCA after reset, " "aborting.\n"); - goto out; + goto free_bh; } if (v != 0xffffffff) @@ -184,7 +184,7 @@ int mthca_reset(struct mthca_dev *mdev) err = -ENODEV; mthca_err(mdev, "PCI device did not come back after reset, " "aborting.\n"); - goto out; + goto free_bh; } good: @@ -195,14 +195,14 @@ good: err = -ENODEV; mthca_err(mdev, "Couldn't restore HCA bridge Upstream " "split transaction control, aborting.\n"); - goto out; + goto free_bh; } if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc, bridge_header[(bridge_pcix_cap + 0xc) / 4])) { err = -ENODEV; mthca_err(mdev, "Couldn't restore HCA bridge Downstream " "split transaction control, aborting.\n"); - goto out; + goto free_bh; } /* * Bridge control register is at 0x3e, so we'll @@ -216,7 +216,7 @@ good: err = -ENODEV; mthca_err(mdev, "Couldn't restore HCA bridge reg %x, " "aborting.\n", i); - goto out; + goto free_bh; } } @@ -225,7 +225,7 @@ good: err = -ENODEV; mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, " "aborting.\n"); - goto 
out; + goto free_bh; } } @@ -235,7 +235,7 @@ good: err = -ENODEV; mthca_err(mdev, "Couldn't restore HCA PCI-X " "command register, aborting.\n"); - goto out; + goto free_bh; } } @@ -246,7 +246,7 @@ good: err = -ENODEV; mthca_err(mdev, "Couldn't restore HCA PCI Express " "Device Control register, aborting.\n"); - goto out; + goto free_bh; } linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4]; if (pcie_capability_write_word(mdev->pdev, PCI_EXP_LNKCTL, @@ -254,7 +254,7 @@ good: err = -ENODEV; mthca_err(mdev, "Couldn't restore HCA PCI Express " "Link control register, aborting.\n"); - goto out; + goto free_bh; } } @@ -266,7 +266,7 @@ good: err = -ENODEV; mthca_err(mdev, "Couldn't restore HCA reg %x, " "aborting.\n", i); - goto out; + goto free_bh; } } @@ -275,14 +275,12 @@ good: err = -ENODEV; mthca_err(mdev, "Couldn't restore HCA COMMAND, " "aborting.\n"); - goto out; } - -out: - if (bridge) - pci_dev_put(bridge); +free_bh: kfree(bridge_header); +free_hca: kfree(hca_header); - +put_dev: + pci_dev_put(bridge); return err; } diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 464d6da5fe91..bd69125731c1 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2606,23 +2606,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr, /** - * show_fw_ver - */ -static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct nes_ib_device *nesibdev = - container_of(dev, struct nes_ib_device, ibdev.dev); - struct nes_vnic *nesvnic = nesibdev->nesvnic; - - nes_debug(NES_DBG_INIT, "\n"); - return sprintf(buf, "%u.%u\n", - (nesvnic->nesdev->nesadapter->firmware_version >> 16), - (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff)); -} - - -/** * show_hca */ static ssize_t show_hca(struct device *dev, struct device_attribute *attr, @@ -2645,13 +2628,11 @@ static ssize_t show_board(struct device *dev, struct 
device_attribute *attr, static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static struct device_attribute *nes_dev_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id }; @@ -3703,6 +3684,19 @@ static int nes_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +static void get_dev_fw_str(struct ib_device *dev, char *str, + size_t str_len) +{ + struct nes_ib_device *nesibdev = + container_of(dev, struct nes_ib_device, ibdev); + struct nes_vnic *nesvnic = nesibdev->nesvnic; + + nes_debug(NES_DBG_INIT, "\n"); + snprintf(str, str_len, "%u.%u", + (nesvnic->nesdev->nesadapter->firmware_version >> 16), + (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff)); +} + /** * nes_init_ofa_device */ @@ -3802,6 +3796,7 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) nesibdev->ibdev.iwcm->create_listen = nes_create_listen; nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen; nesibdev->ibdev.get_port_immutable = nes_port_immutable; + nesibdev->ibdev.get_dev_fw_str = get_dev_fw_str; memcpy(nesibdev->ibdev.iwcm->ifname, netdev->name, sizeof(nesibdev->ibdev.iwcm->ifname)); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 3d75f65ce87e..07d0c6c5b046 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -107,6 +107,14 @@ static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +static void get_dev_fw_str(struct ib_device *device, char *str, + size_t str_len) +{ + struct ocrdma_dev *dev = get_ocrdma_dev(device); + + snprintf(str, str_len, "%s", &dev->attr.fw_ver[0]); +} + static int ocrdma_register_device(struct ocrdma_dev *dev) { strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX); @@ 
-193,6 +201,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.process_mad = ocrdma_process_mad; dev->ibdev.get_port_immutable = ocrdma_port_immutable; + dev->ibdev.get_dev_fw_str = get_dev_fw_str; if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { dev->ibdev.uverbs_cmd_mask |= @@ -262,14 +271,6 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr, return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); } -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct ocrdma_dev *dev = dev_get_drvdata(device); - - return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->attr.fw_ver[0]); -} - static ssize_t show_hca_type(struct device *device, struct device_attribute *attr, char *buf) { @@ -279,12 +280,10 @@ static ssize_t show_hca_type(struct device *device, } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); static struct device_attribute *ocrdma_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_ver, &dev_attr_hca_type }; diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 575b737d9ef3..9cc0aae1d781 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -106,6 +106,49 @@ static u32 credit_table[31] = { 32768 /* 1E */ }; +const struct rvt_operation_params qib_post_parms[RVT_OPERATION_MAX] = { +[IB_WR_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_RDMA_READ] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC, +}, + +[IB_WR_ATOMIC_CMP_AND_SWP] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_ATOMIC_FETCH_AND_ADD] = { + .length = sizeof(struct 
ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_RDMA_WRITE_WITH_IMM] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND_WITH_IMM] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +}; + static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map, gfp_t gfp) { diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 846e6c726df7..10d062561bd9 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -169,8 +169,12 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) } if (ah_attr->ah_flags & IB_AH_GRH) { - qib_copy_sge(&qp->r_sge, &ah_attr->grh, - sizeof(struct ib_grh), 1); + struct ib_grh grh; + struct ib_global_route grd = ah_attr->grh; + + qib_make_grh(ibp, &grh, &grd, 0, 0); + qib_copy_sge(&qp->r_sge, &grh, + sizeof(grh), 1); wc.wc_flags |= IB_WC_GRH; } else qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index cbf6200e6afc..fd1dfbce5539 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1582,6 +1582,8 @@ static void qib_fill_device_attr(struct qib_devdata *dd) rdi->dparms.props.max_total_mcast_qp_attach = rdi->dparms.props.max_mcast_qp_attach * rdi->dparms.props.max_mcast_grp; + /* post send table */ + dd->verbs_dev.rdi.post_parms = qib_post_parms; } /** diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 4f878151f81f..736ced684842 100644 --- 
a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -497,4 +497,6 @@ extern unsigned int ib_qib_max_srq_wrs; extern const u32 ib_qib_rnr_table[]; +extern const struct rvt_operation_params qib_post_parms[]; + #endif /* QIB_VERBS_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 565c881a44ba..c229b9f4a52d 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -331,6 +331,21 @@ static int usnic_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +static void usnic_get_dev_fw_str(struct ib_device *device, + char *str, + size_t str_len) +{ + struct usnic_ib_dev *us_ibdev = + container_of(device, struct usnic_ib_dev, ib_dev); + struct ethtool_drvinfo info; + + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); + mutex_unlock(&us_ibdev->usdev_lock); + + snprintf(str, str_len, "%s", info.fw_version); +} + /* Start of PF discovery section */ static void *usnic_ib_device_add(struct pci_dev *dev) { @@ -414,6 +429,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq; us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr; us_ibdev->ib_dev.get_port_immutable = usnic_port_immutable; + us_ibdev->ib_dev.get_dev_fw_str = usnic_get_dev_fw_str; if (ib_register_device(&us_ibdev->ib_dev, NULL)) diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c index 3412ea06116e..80ef3f8998c8 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -45,21 +45,6 @@ #include "usnic_ib_verbs.h" #include "usnic_log.h" -static ssize_t usnic_ib_show_fw_ver(struct device *device, - struct device_attribute *attr, - char *buf) -{ - struct usnic_ib_dev *us_ibdev = - container_of(device, struct usnic_ib_dev, ib_dev.dev); - struct 
ethtool_drvinfo info; - - mutex_lock(&us_ibdev->usdev_lock); - us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); - mutex_unlock(&us_ibdev->usdev_lock); - - return scnprintf(buf, PAGE_SIZE, "%s\n", info.fw_version); -} - static ssize_t usnic_ib_show_board(struct device *device, struct device_attribute *attr, char *buf) @@ -192,7 +177,6 @@ usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr, us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]); } -static DEVICE_ATTR(fw_ver, S_IRUGO, usnic_ib_show_fw_ver, NULL); static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL); static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL); static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL); @@ -201,7 +185,6 @@ static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL); static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL); static struct device_attribute *usnic_class_attributes[] = { - &dev_attr_fw_ver, &dev_attr_board_id, &dev_attr_config, &dev_attr_iface, diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile index 988b6a0101a4..8b095b27db87 100644 --- a/drivers/infiniband/sw/Makefile +++ b/drivers/infiniband/sw/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/ +obj-$(CONFIG_RDMA_RXE) += rxe/ diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig index 11aa6a34bd71..1da8d01a6855 100644 --- a/drivers/infiniband/sw/rdmavt/Kconfig +++ b/drivers/infiniband/sw/rdmavt/Kconfig @@ -1,6 +1,5 @@ config INFINIBAND_RDMAVT tristate "RDMA verbs transport library" depends on 64BIT - default m ---help--- This is a common software verbs provider for RDMA networks. 
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 6ca6fa80dd6e..f2f229efbe64 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -510,6 +510,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi) if (rdi->worker) return 0; + spin_lock_init(&rdi->n_cqs_lock); rdi->worker = kzalloc(sizeof(*rdi->worker), GFP_KERNEL); if (!rdi->worker) return -ENOMEM; diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 0f4d4500f45e..80c4b6b401b8 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -140,6 +140,7 @@ static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd, init_completion(&mr->comp); /* count returning the ptr to user */ atomic_set(&mr->refcount, 1); + atomic_set(&mr->lkey_invalid, 0); mr->pd = pd; mr->max_segs = count; return 0; @@ -480,6 +481,123 @@ struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, } /** + * rvt_set_page - page assignment function called by ib_sg_to_pages + * @ibmr: memory region + * @addr: dma address of mapped page + * + * Return: 0 on success + */ +static int rvt_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct rvt_mr *mr = to_imr(ibmr); + u32 ps = 1 << mr->mr.page_shift; + u32 mapped_segs = mr->mr.length >> mr->mr.page_shift; + int m, n; + + if (unlikely(mapped_segs == mr->mr.max_segs)) + return -ENOMEM; + + if (mr->mr.length == 0) { + mr->mr.user_base = addr; + mr->mr.iova = addr; + } + + m = mapped_segs / RVT_SEGSZ; + n = mapped_segs % RVT_SEGSZ; + mr->mr.map[m]->segs[n].vaddr = (void *)addr; + mr->mr.map[m]->segs[n].length = ps; + mr->mr.length += ps; + + return 0; +} + +/** + * rvt_map_mr_sg - map sg list and set it the memory region + * @ibmr: memory region + * @sg: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @sg_offset: offset in bytes into sg + * + * Return: number of sg elements mapped to the memory region + */ +int rvt_map_mr_sg(struct ib_mr *ibmr, struct 
scatterlist *sg, + int sg_nents, unsigned int *sg_offset) +{ + struct rvt_mr *mr = to_imr(ibmr); + + mr->mr.length = 0; + mr->mr.page_shift = PAGE_SHIFT; + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, + rvt_set_page); +} + +/** + * rvt_fast_reg_mr - fast register physical MR + * @qp: the queue pair where the work request comes from + * @ibmr: the memory region to be registered + * @key: updated key for this memory region + * @access: access flags for this memory region + * + * Returns 0 on success. + */ +int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, + int access) +{ + struct rvt_mr *mr = to_imr(ibmr); + + if (qp->ibqp.pd != mr->mr.pd) + return -EACCES; + + /* not applicable to dma MR or user MR */ + if (!mr->mr.lkey || mr->umem) + return -EINVAL; + + if ((key & 0xFFFFFF00) != (mr->mr.lkey & 0xFFFFFF00)) + return -EINVAL; + + ibmr->lkey = key; + ibmr->rkey = key; + mr->mr.lkey = key; + mr->mr.access_flags = access; + atomic_set(&mr->mr.lkey_invalid, 0); + + return 0; +} +EXPORT_SYMBOL(rvt_fast_reg_mr); + +/** + * rvt_invalidate_rkey - invalidate an MR rkey + * @qp: queue pair associated with the invalidate op + * @rkey: rkey to invalidate + * + * Returns 0 on success. 
+ */ +int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey) +{ + struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device); + struct rvt_lkey_table *rkt = &dev->lkey_table; + struct rvt_mregion *mr; + + if (rkey == 0) + return -EINVAL; + + rcu_read_lock(); + mr = rcu_dereference( + rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]); + if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + goto bail; + + atomic_set(&mr->lkey_invalid, 1); + rcu_read_unlock(); + return 0; + +bail: + rcu_read_unlock(); + return -EINVAL; +} +EXPORT_SYMBOL(rvt_invalidate_rkey); + +/** * rvt_alloc_fmr - allocate a fast memory region * @pd: the protection domain for this memory region * @mr_access_flags: access flags for this memory region @@ -682,7 +800,8 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, } mr = rcu_dereference( rkt->table[(sge->lkey >> (32 - dev->dparms.lkey_table_size))]); - if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) + if (unlikely(!mr || atomic_read(&mr->lkey_invalid) || + mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) goto bail; off = sge->addr - mr->user_base; @@ -782,7 +901,8 @@ int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, mr = rcu_dereference( rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]); - if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + if (unlikely(!mr || atomic_read(&mr->lkey_invalid) || + mr->lkey != rkey || qp->ibqp.pd != mr->pd)) goto bail; off = vaddr - mr->iova; diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h index 69380512c6d1..132800ee0205 100644 --- a/drivers/infiniband/sw/rdmavt/mr.h +++ b/drivers/infiniband/sw/rdmavt/mr.h @@ -82,6 +82,8 @@ int rvt_dereg_mr(struct ib_mr *ibmr); struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); +int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset); struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int 
mr_access_flags, struct ib_fmr_attr *fmr_attr); int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 41ba7e9cadaa..bdb540f25a88 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -435,8 +435,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) for (n = 0; n < rvt_max_atomic(rdi); n++) { struct rvt_ack_entry *e = &qp->s_ack_queue[n]; - if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && - e->rdma_sge.mr) { + if (e->rdma_sge.mr) { rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } @@ -584,6 +583,7 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->r_rq.wq->tail = 0; } qp->r_sge.num_sge = 0; + atomic_set(&qp->s_reserved_used, 0); } /** @@ -613,6 +613,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device); void *priv = NULL; gfp_t gfp; + size_t sqsize; if (!rdi) return ERR_PTR(-EINVAL); @@ -643,7 +644,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, init_attr->cap.max_recv_wr == 0) return ERR_PTR(-EINVAL); } - + sqsize = + init_attr->cap.max_send_wr + 1 + + rdi->dparms.reserved_operations; switch (init_attr->qp_type) { case IB_QPT_SMI: case IB_QPT_GSI: @@ -658,11 +661,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, sizeof(struct rvt_swqe); if (gfp == GFP_NOIO) swq = __vmalloc( - (init_attr->cap.max_send_wr + 1) * sz, + sqsize * sz, gfp | __GFP_ZERO, PAGE_KERNEL); else swq = vzalloc_node( - (init_attr->cap.max_send_wr + 1) * sz, + sqsize * sz, rdi->dparms.node); if (!swq) return ERR_PTR(-ENOMEM); @@ -741,13 +744,14 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, spin_lock_init(&qp->s_lock); spin_lock_init(&qp->r_rq.lock); atomic_set(&qp->refcount, 0); + atomic_set(&qp->local_ops_pending, 0); init_waitqueue_head(&qp->wait); init_timer(&qp->s_timer); qp->s_timer.data = (unsigned long)qp; INIT_LIST_HEAD(&qp->rspwait); qp->state = 
IB_QPS_RESET; qp->s_wq = swq; - qp->s_size = init_attr->cap.max_send_wr + 1; + qp->s_size = sqsize; qp->s_avail = init_attr->cap.max_send_wr; qp->s_max_sge = init_attr->cap.max_send_sge; if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) @@ -1332,7 +1336,8 @@ int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask; attr->dest_qp_num = qp->remote_qpn; attr->qp_access_flags = qp->qp_access_flags; - attr->cap.max_send_wr = qp->s_size - 1; + attr->cap.max_send_wr = qp->s_size - 1 - + rdi->dparms.reserved_operations; attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1; attr->cap.max_send_sge = qp->s_max_sge; attr->cap.max_recv_sge = qp->r_rq.max_sge; @@ -1440,25 +1445,116 @@ int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, } /** - * qp_get_savail - return number of avail send entries + * rvt_qp_valid_operation - validate post send wr request + * @qp - the qp + * @post-parms - the post send table for the driver + * @wr - the work request + * + * The routine validates the operation based on the + * validation table an returns the length of the operation + * which can extend beyond the ib_send_bw. Operation + * dependent flags key atomic operation validation. * + * There is an exception for UD qps that validates the pd and + * overrides the length to include the additional UD specific + * length. + * + * Returns a negative error or the length of the work request + * for building the swqe. 
+ */ +static inline int rvt_qp_valid_operation( + struct rvt_qp *qp, + const struct rvt_operation_params *post_parms, + struct ib_send_wr *wr) +{ + int len; + + if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length) + return -EINVAL; + if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type))) + return -EINVAL; + if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) && + ibpd_to_rvtpd(qp->ibqp.pd)->user) + return -EINVAL; + if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + return -EINVAL; + if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC && + !qp->s_max_rd_atomic) + return -EINVAL; + len = post_parms[wr->opcode].length; + /* UD specific */ + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) { + if (qp->ibqp.pd != ud_wr(wr)->ah->pd) + return -EINVAL; + len = sizeof(struct ib_ud_wr); + } + return len; +} + +/** + * rvt_qp_is_avail - determine queue capacity * @qp - the qp + * @rdi - the rdmavt device + * @reserved_op - is reserved operation * * This assumes the s_hlock is held but the s_last * qp variable is uncontrolled. + * + * For non reserved operations, the qp->s_avail + * may be changed. + * + * The return value is zero or a -ENOMEM. 
*/ -static inline u32 qp_get_savail(struct rvt_qp *qp) +static inline int rvt_qp_is_avail( + struct rvt_qp *qp, + struct rvt_dev_info *rdi, + bool reserved_op) { u32 slast; - u32 ret; - + u32 avail; + u32 reserved_used; + + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + reserved_used = atomic_read(&qp->s_reserved_used); + if (unlikely(reserved_op)) { + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + if (reserved_used >= rdi->dparms.reserved_operations) + return -ENOMEM; + return 0; + } + /* non-reserved operations */ + if (likely(qp->s_avail)) + return 0; smp_read_barrier_depends(); /* see rc.c */ slast = ACCESS_ONCE(qp->s_last); if (qp->s_head >= slast) - ret = qp->s_size - (qp->s_head - slast); + avail = qp->s_size - (qp->s_head - slast); else - ret = slast - qp->s_head; - return ret - 1; + avail = slast - qp->s_head; + + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + reserved_used = atomic_read(&qp->s_reserved_used); + avail = avail - 1 - + (rdi->dparms.reserved_operations - reserved_used); + /* insure we don't assign a negative s_avail */ + if ((s32)avail <= 0) + return -ENOMEM; + qp->s_avail = avail; + if (WARN_ON(qp->s_avail > + (qp->s_size - 1 - rdi->dparms.reserved_operations))) + rvt_pr_err(rdi, + "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u", + qp->ibqp.qp_num, qp->s_size, qp->s_avail, + qp->s_head, qp->s_tail, qp->s_cur, + qp->s_acked, qp->s_last); + return 0; } /** @@ -1480,49 +1576,64 @@ static int rvt_post_one_wr(struct rvt_qp *qp, struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); u8 log_pmtu; int ret; + size_t cplen; + bool reserved_op; + int local_ops_delayed = 0; + + BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE)); /* IB spec says that num_sge == 0 is OK. 
*/ if (unlikely(wr->num_sge > qp->s_max_sge)) return -EINVAL; + ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr); + if (ret < 0) + return ret; + cplen = ret; + /* - * Don't allow RDMA reads or atomic operations on UC or - * undefined operations. - * Make sure buffer is large enough to hold the result for atomics. + * Local operations include fast register and local invalidate. + * Fast register needs to be processed immediately because the + * registered lkey may be used by following work requests and the + * lkey needs to be valid at the time those requests are posted. + * Local invalidate can be processed immediately if fencing is + * not required and no previous local invalidate ops are pending. + * Signaled local operations that have been processed immediately + * need to have requests with "completion only" flags set posted + * to the send queue in order to generate completions. */ - if (qp->ibqp.qp_type == IB_QPT_UC) { - if ((unsigned)wr->opcode >= IB_WR_RDMA_READ) - return -EINVAL; - } else if (qp->ibqp.qp_type != IB_QPT_RC) { - /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */ - if (wr->opcode != IB_WR_SEND && - wr->opcode != IB_WR_SEND_WITH_IMM) - return -EINVAL; - /* Check UD destination address PD */ - if (qp->ibqp.pd != ud_wr(wr)->ah->pd) + if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) { + switch (wr->opcode) { + case IB_WR_REG_MR: + ret = rvt_fast_reg_mr(qp, + reg_wr(wr)->mr, + reg_wr(wr)->key, + reg_wr(wr)->access); + if (ret || !(wr->send_flags & IB_SEND_SIGNALED)) + return ret; + break; + case IB_WR_LOCAL_INV: + if ((wr->send_flags & IB_SEND_FENCE) || + atomic_read(&qp->local_ops_pending)) { + local_ops_delayed = 1; + } else { + ret = rvt_invalidate_rkey( + qp, wr->ex.invalidate_rkey); + if (ret || !(wr->send_flags & IB_SEND_SIGNALED)) + return ret; + } + break; + default: return -EINVAL; - } else if ((unsigned)wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) { - return -EINVAL; - } else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && 
- (wr->num_sge == 0 || - wr->sg_list[0].length < sizeof(u64) || - wr->sg_list[0].addr & (sizeof(u64) - 1))) { - return -EINVAL; - } else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) { - return -EINVAL; + } } + + reserved_op = rdi->post_parms[wr->opcode].flags & + RVT_OPERATION_USE_RESERVE; /* check for avail */ - if (unlikely(!qp->s_avail)) { - qp->s_avail = qp_get_savail(qp); - if (WARN_ON(qp->s_avail > (qp->s_size - 1))) - rvt_pr_err(rdi, - "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u", - qp->ibqp.qp_num, qp->s_size, qp->s_avail, - qp->s_head, qp->s_tail, qp->s_cur, - qp->s_acked, qp->s_last); - if (!qp->s_avail) - return -ENOMEM; - } + ret = rvt_qp_is_avail(qp, rdi, reserved_op); + if (ret) + return ret; next = qp->s_head + 1; if (next >= qp->s_size) next = 0; @@ -1531,18 +1642,8 @@ static int rvt_post_one_wr(struct rvt_qp *qp, pd = ibpd_to_rvtpd(qp->ibqp.pd); wqe = rvt_get_swqe_ptr(qp, qp->s_head); - if (qp->ibqp.qp_type != IB_QPT_UC && - qp->ibqp.qp_type != IB_QPT_RC) - memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr)); - else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || - wr->opcode == IB_WR_RDMA_WRITE || - wr->opcode == IB_WR_RDMA_READ) - memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr)); - else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || - wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) - memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr)); - else - memcpy(&wqe->wr, wr, sizeof(wqe->wr)); + /* cplen has length from above */ + memcpy(&wqe->wr, wr, cplen); wqe->length = 0; j = 0; @@ -1585,14 +1686,29 @@ static int rvt_post_one_wr(struct rvt_qp *qp, atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); } - wqe->ssn = qp->s_ssn++; - wqe->psn = qp->s_next_psn; - wqe->lpsn = wqe->psn + - (wqe->length ? 
((wqe->length - 1) >> log_pmtu) : 0); - qp->s_next_psn = wqe->lpsn + 1; + if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) { + if (local_ops_delayed) + atomic_inc(&qp->local_ops_pending); + else + wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY; + wqe->ssn = 0; + wqe->psn = 0; + wqe->lpsn = 0; + } else { + wqe->ssn = qp->s_ssn++; + wqe->psn = qp->s_next_psn; + wqe->lpsn = wqe->psn + + (wqe->length ? + ((wqe->length - 1) >> log_pmtu) : + 0); + qp->s_next_psn = wqe->lpsn + 1; + } trace_rvt_post_one_wr(qp, wqe); + if (unlikely(reserved_op)) + rvt_qp_wqe_reserve(qp, wqe); + else + qp->s_avail--; smp_wmb(); /* see request builders */ - qp->s_avail--; qp->s_head = next; return 0; diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 30c4fda7a05a..d430c2f7cec4 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -370,6 +370,7 @@ enum { REG_USER_MR, DEREG_MR, ALLOC_MR, + MAP_MR_SG, ALLOC_FMR, MAP_PHYS_FMR, UNMAP_FMR, @@ -528,7 +529,8 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) post_send), rvt_post_send)) if (!rdi->driver_f.schedule_send || - !rdi->driver_f.do_send) + !rdi->driver_f.do_send || + !rdi->post_parms) return -EINVAL; break; @@ -633,6 +635,12 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) rvt_alloc_mr); break; + case MAP_MR_SG: + check_driver_override(rdi, offsetof(struct ib_device, + map_mr_sg), + rvt_map_mr_sg); + break; + case MAP_PHYS_FMR: check_driver_override(rdi, offsetof(struct ib_device, map_phys_fmr), diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig new file mode 100644 index 000000000000..1e4e628fe7b0 --- /dev/null +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -0,0 +1,24 @@ +config RDMA_RXE + tristate "Software RDMA over Ethernet (RoCE) driver" + depends on INET && PCI && INFINIBAND + depends on NET_UDP_TUNNEL + ---help--- + This driver implements the InfiniBand RDMA transport over + 
the Linux network stack. It enables a system with a + standard Ethernet adapter to interoperate with a RoCE + adapter or with another system running the RXE driver. + Documentation on InfiniBand and RoCE can be downloaded at + www.infinibandta.org and www.openfabrics.org. (See also + siw which is a similar software driver for iWARP.) + + The driver is split into two layers, one interfaces with the + Linux RDMA stack and implements a kernel or user space + verbs API. The user space verbs API requires a support + library named librxe which is loaded by the generic user + space verbs API, libibverbs. The other layer interfaces + with the Linux network stack at layer 3. + + To configure and work with soft-RoCE driver please use the + following wiki page under "configure Soft-RoCE (RXE)" section: + + https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile new file mode 100644 index 000000000000..3b3fb9d1c470 --- /dev/null +++ b/drivers/infiniband/sw/rxe/Makefile @@ -0,0 +1,24 @@ +obj-$(CONFIG_RDMA_RXE) += rdma_rxe.o + +rdma_rxe-y := \ + rxe.o \ + rxe_comp.o \ + rxe_req.o \ + rxe_resp.o \ + rxe_recv.o \ + rxe_pool.o \ + rxe_queue.o \ + rxe_verbs.o \ + rxe_av.o \ + rxe_srq.o \ + rxe_qp.o \ + rxe_cq.o \ + rxe_mr.o \ + rxe_dma.o \ + rxe_opcode.o \ + rxe_mmap.o \ + rxe_icrc.o \ + rxe_mcast.o \ + rxe_task.o \ + rxe_net.o \ + rxe_sysfs.o diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c new file mode 100644 index 000000000000..55f0e8f0ca79 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "rxe.h" +#include "rxe_loc.h" + +MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib"); +MODULE_DESCRIPTION("Soft RDMA transport"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION("0.2"); + +/* free resources for all ports on a device */ +static void rxe_cleanup_ports(struct rxe_dev *rxe) +{ + kfree(rxe->port.pkey_tbl); + rxe->port.pkey_tbl = NULL; + +} + +/* free resources for a rxe device all objects created for this device must + * have been destroyed + */ +static void rxe_cleanup(struct rxe_dev *rxe) +{ + rxe_pool_cleanup(&rxe->uc_pool); + rxe_pool_cleanup(&rxe->pd_pool); + rxe_pool_cleanup(&rxe->ah_pool); + rxe_pool_cleanup(&rxe->srq_pool); + rxe_pool_cleanup(&rxe->qp_pool); + rxe_pool_cleanup(&rxe->cq_pool); + rxe_pool_cleanup(&rxe->mr_pool); + rxe_pool_cleanup(&rxe->mw_pool); + rxe_pool_cleanup(&rxe->mc_grp_pool); + rxe_pool_cleanup(&rxe->mc_elem_pool); + + rxe_cleanup_ports(rxe); +} + +/* called when all references have been dropped */ +void rxe_release(struct kref *kref) +{ + struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt); + + rxe_cleanup(rxe); + ib_dealloc_device(&rxe->ib_dev); +} + +void rxe_dev_put(struct rxe_dev *rxe) +{ + kref_put(&rxe->ref_cnt, rxe_release); +} +EXPORT_SYMBOL_GPL(rxe_dev_put); + +/* initialize rxe device parameters */ +static int rxe_init_device_param(struct rxe_dev *rxe) +{ + rxe->max_inline_data = RXE_MAX_INLINE_DATA; + + rxe->attr.fw_ver = RXE_FW_VER; + rxe->attr.max_mr_size = RXE_MAX_MR_SIZE; + rxe->attr.page_size_cap = RXE_PAGE_SIZE_CAP; + rxe->attr.vendor_id = RXE_VENDOR_ID; + rxe->attr.vendor_part_id = RXE_VENDOR_PART_ID; + rxe->attr.hw_ver = RXE_HW_VER; + rxe->attr.max_qp = RXE_MAX_QP; + rxe->attr.max_qp_wr = RXE_MAX_QP_WR; + rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS; + rxe->attr.max_sge = RXE_MAX_SGE; + rxe->attr.max_sge_rd = RXE_MAX_SGE_RD; + rxe->attr.max_cq = RXE_MAX_CQ; + rxe->attr.max_cqe = (1 << RXE_MAX_LOG_CQE) - 1; + rxe->attr.max_mr = RXE_MAX_MR; + 
rxe->attr.max_pd = RXE_MAX_PD; + rxe->attr.max_qp_rd_atom = RXE_MAX_QP_RD_ATOM; + rxe->attr.max_ee_rd_atom = RXE_MAX_EE_RD_ATOM; + rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM; + rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM; + rxe->attr.max_ee_init_rd_atom = RXE_MAX_EE_INIT_RD_ATOM; + rxe->attr.atomic_cap = RXE_ATOMIC_CAP; + rxe->attr.max_ee = RXE_MAX_EE; + rxe->attr.max_rdd = RXE_MAX_RDD; + rxe->attr.max_mw = RXE_MAX_MW; + rxe->attr.max_raw_ipv6_qp = RXE_MAX_RAW_IPV6_QP; + rxe->attr.max_raw_ethy_qp = RXE_MAX_RAW_ETHY_QP; + rxe->attr.max_mcast_grp = RXE_MAX_MCAST_GRP; + rxe->attr.max_mcast_qp_attach = RXE_MAX_MCAST_QP_ATTACH; + rxe->attr.max_total_mcast_qp_attach = RXE_MAX_TOT_MCAST_QP_ATTACH; + rxe->attr.max_ah = RXE_MAX_AH; + rxe->attr.max_fmr = RXE_MAX_FMR; + rxe->attr.max_map_per_fmr = RXE_MAX_MAP_PER_FMR; + rxe->attr.max_srq = RXE_MAX_SRQ; + rxe->attr.max_srq_wr = RXE_MAX_SRQ_WR; + rxe->attr.max_srq_sge = RXE_MAX_SRQ_SGE; + rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; + rxe->attr.max_pkeys = RXE_MAX_PKEYS; + rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; + + rxe->max_ucontext = RXE_MAX_UCONTEXT; + + return 0; +} + +/* initialize port attributes */ +static int rxe_init_port_param(struct rxe_port *port) +{ + port->attr.state = RXE_PORT_STATE; + port->attr.max_mtu = RXE_PORT_MAX_MTU; + port->attr.active_mtu = RXE_PORT_ACTIVE_MTU; + port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN; + port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS; + port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ; + port->attr.bad_pkey_cntr = RXE_PORT_BAD_PKEY_CNTR; + port->attr.qkey_viol_cntr = RXE_PORT_QKEY_VIOL_CNTR; + port->attr.pkey_tbl_len = RXE_PORT_PKEY_TBL_LEN; + port->attr.lid = RXE_PORT_LID; + port->attr.sm_lid = RXE_PORT_SM_LID; + port->attr.lmc = RXE_PORT_LMC; + port->attr.max_vl_num = RXE_PORT_MAX_VL_NUM; + port->attr.sm_sl = RXE_PORT_SM_SL; + port->attr.subnet_timeout = RXE_PORT_SUBNET_TIMEOUT; + port->attr.init_type_reply = 
RXE_PORT_INIT_TYPE_REPLY; + port->attr.active_width = RXE_PORT_ACTIVE_WIDTH; + port->attr.active_speed = RXE_PORT_ACTIVE_SPEED; + port->attr.phys_state = RXE_PORT_PHYS_STATE; + port->mtu_cap = + ib_mtu_enum_to_int(RXE_PORT_ACTIVE_MTU); + port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX); + + return 0; +} + +/* initialize port state, note IB convention that HCA ports are always + * numbered from 1 + */ +static int rxe_init_ports(struct rxe_dev *rxe) +{ + struct rxe_port *port = &rxe->port; + + rxe_init_port_param(port); + + if (!port->attr.pkey_tbl_len || !port->attr.gid_tbl_len) + return -EINVAL; + + port->pkey_tbl = kcalloc(port->attr.pkey_tbl_len, + sizeof(*port->pkey_tbl), GFP_KERNEL); + + if (!port->pkey_tbl) + return -ENOMEM; + + port->pkey_tbl[0] = 0xffff; + port->port_guid = rxe->ifc_ops->port_guid(rxe); + + spin_lock_init(&port->port_lock); + + return 0; +} + +/* init pools of managed objects */ +static int rxe_init_pools(struct rxe_dev *rxe) +{ + int err; + + err = rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC, + rxe->max_ucontext); + if (err) + goto err1; + + err = rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD, + rxe->attr.max_pd); + if (err) + goto err2; + + err = rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH, + rxe->attr.max_ah); + if (err) + goto err3; + + err = rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ, + rxe->attr.max_srq); + if (err) + goto err4; + + err = rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP, + rxe->attr.max_qp); + if (err) + goto err5; + + err = rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ, + rxe->attr.max_cq); + if (err) + goto err6; + + err = rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR, + rxe->attr.max_mr); + if (err) + goto err7; + + err = rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW, + rxe->attr.max_mw); + if (err) + goto err8; + + err = rxe_pool_init(rxe, &rxe->mc_grp_pool, RXE_TYPE_MC_GRP, + rxe->attr.max_mcast_grp); + if (err) + goto err9; + + err = rxe_pool_init(rxe, &rxe->mc_elem_pool, RXE_TYPE_MC_ELEM, + 
rxe->attr.max_total_mcast_qp_attach); + if (err) + goto err10; + + return 0; + +err10: + rxe_pool_cleanup(&rxe->mc_grp_pool); +err9: + rxe_pool_cleanup(&rxe->mw_pool); +err8: + rxe_pool_cleanup(&rxe->mr_pool); +err7: + rxe_pool_cleanup(&rxe->cq_pool); +err6: + rxe_pool_cleanup(&rxe->qp_pool); +err5: + rxe_pool_cleanup(&rxe->srq_pool); +err4: + rxe_pool_cleanup(&rxe->ah_pool); +err3: + rxe_pool_cleanup(&rxe->pd_pool); +err2: + rxe_pool_cleanup(&rxe->uc_pool); +err1: + return err; +} + +/* initialize rxe device state */ +static int rxe_init(struct rxe_dev *rxe) +{ + int err; + + /* init default device parameters */ + rxe_init_device_param(rxe); + + err = rxe_init_ports(rxe); + if (err) + goto err1; + + err = rxe_init_pools(rxe); + if (err) + goto err2; + + /* init pending mmap list */ + spin_lock_init(&rxe->mmap_offset_lock); + spin_lock_init(&rxe->pending_lock); + INIT_LIST_HEAD(&rxe->pending_mmaps); + INIT_LIST_HEAD(&rxe->list); + + mutex_init(&rxe->usdev_lock); + + return 0; + +err2: + rxe_cleanup_ports(rxe); +err1: + return err; +} + +int rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) +{ + struct rxe_port *port = &rxe->port; + enum ib_mtu mtu; + + mtu = eth_mtu_int_to_enum(ndev_mtu); + + /* Make sure that new MTU in range */ + mtu = mtu ? min_t(enum ib_mtu, mtu, RXE_PORT_MAX_MTU) : IB_MTU_256; + + port->attr.active_mtu = mtu; + port->mtu_cap = ib_mtu_enum_to_int(mtu); + + return 0; +} +EXPORT_SYMBOL(rxe_set_mtu); + +/* called by ifc layer to create new rxe device. + * The caller should allocate memory for rxe by calling ib_alloc_device. 
+ */ +int rxe_add(struct rxe_dev *rxe, unsigned int mtu) +{ + int err; + + kref_init(&rxe->ref_cnt); + + err = rxe_init(rxe); + if (err) + goto err1; + + err = rxe_set_mtu(rxe, mtu); + if (err) + goto err1; + + err = rxe_register_device(rxe); + if (err) + goto err1; + + return 0; + +err1: + rxe_dev_put(rxe); + return err; +} +EXPORT_SYMBOL(rxe_add); + +/* called by the ifc layer to remove a device */ +void rxe_remove(struct rxe_dev *rxe) +{ + rxe_unregister_device(rxe); + + rxe_dev_put(rxe); +} +EXPORT_SYMBOL(rxe_remove); + +static int __init rxe_module_init(void) +{ + int err; + + /* initialize slab caches for managed objects */ + err = rxe_cache_init(); + if (err) { + pr_err("rxe: unable to init object pools\n"); + return err; + } + + err = rxe_net_init(); + if (err) { + pr_err("rxe: unable to init\n"); + rxe_cache_exit(); + return err; + } + pr_info("rxe: loaded\n"); + + return 0; +} + +static void __exit rxe_module_exit(void) +{ + rxe_remove_all(); + rxe_net_exit(); + rxe_cache_exit(); + + pr_info("rxe: unloaded\n"); +} + +module_init(rxe_module_init); +module_exit(rxe_module_exit); diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h new file mode 100644 index 000000000000..12c71c549f97 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef RXE_H +#define RXE_H + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/crc32.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_pack.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_addr.h> + +#include "rxe_net.h" +#include "rxe_opcode.h" +#include "rxe_hdr.h" +#include "rxe_param.h" +#include "rxe_verbs.h" + +#define RXE_UVERBS_ABI_VERSION (1) + +#define IB_PHYS_STATE_LINK_UP (5) +#define IB_PHYS_STATE_LINK_DOWN (3) + +#define RXE_ROCE_V2_SPORT (0xc000) + +int rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); + +int rxe_add(struct rxe_dev *rxe, unsigned int mtu); +void rxe_remove(struct rxe_dev *rxe); +void rxe_remove_all(void); + +int rxe_rcv(struct sk_buff *skb); + +void rxe_dev_put(struct rxe_dev *rxe); +struct rxe_dev *net_to_rxe(struct net_device *ndev); +struct rxe_dev *get_rxe_by_name(const char* name); + +void rxe_port_up(struct rxe_dev *rxe); +void rxe_port_down(struct rxe_dev *rxe); + +#endif /* RXE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c new file mode 100644 index 000000000000..5c9474212d4e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "rxe.h" +#include "rxe_loc.h" + +int rxe_av_chk_attr(struct rxe_dev *rxe, struct ib_ah_attr *attr) +{ + struct rxe_port *port; + + if (attr->port_num != 1) { + pr_info("rxe: invalid port_num = %d\n", attr->port_num); + return -EINVAL; + } + + port = &rxe->port; + + if (attr->ah_flags & IB_AH_GRH) { + if (attr->grh.sgid_index > port->attr.gid_tbl_len) { + pr_info("rxe: invalid sgid index = %d\n", + attr->grh.sgid_index); + return -EINVAL; + } + } + + return 0; +} + +int rxe_av_from_attr(struct rxe_dev *rxe, u8 port_num, + struct rxe_av *av, struct ib_ah_attr *attr) +{ + memset(av, 0, sizeof(*av)); + memcpy(&av->grh, &attr->grh, sizeof(attr->grh)); + av->port_num = port_num; + return 0; +} + +int rxe_av_to_attr(struct rxe_dev *rxe, struct rxe_av *av, + struct ib_ah_attr *attr) +{ + memcpy(&attr->grh, &av->grh, sizeof(av->grh)); + attr->port_num = av->port_num; + return 0; +} + +int rxe_av_fill_ip_info(struct rxe_dev *rxe, + struct rxe_av *av, + struct ib_ah_attr *attr, + struct ib_gid_attr *sgid_attr, + union ib_gid *sgid) +{ + rdma_gid2ip(&av->sgid_addr._sockaddr, sgid); + rdma_gid2ip(&av->dgid_addr._sockaddr, &attr->grh.dgid); + av->network_type = ib_gid_to_network_type(sgid_attr->gid_type, sgid); + + return 0; +} + +struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt) +{ + if (!pkt || !pkt->qp) + return NULL; + + if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC) + return &pkt->qp->pri_av; + + return (pkt->wqe) ? &pkt->wqe->av : NULL; +} diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c new file mode 100644 index 000000000000..36f67de44095 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -0,0 +1,734 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/skbuff.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" +#include "rxe_task.h" + +enum comp_state { + COMPST_GET_ACK, + COMPST_GET_WQE, + COMPST_COMP_WQE, + COMPST_COMP_ACK, + COMPST_CHECK_PSN, + COMPST_CHECK_ACK, + COMPST_READ, + COMPST_ATOMIC, + COMPST_WRITE_SEND, + COMPST_UPDATE_COMP, + COMPST_ERROR_RETRY, + COMPST_RNR_RETRY, + COMPST_ERROR, + COMPST_EXIT, /* We have an issue, and we want to rerun the completer */ + COMPST_DONE, /* The completer finished successfully */ +}; + +static char *comp_state_name[] = { + [COMPST_GET_ACK] = "GET ACK", + [COMPST_GET_WQE] = "GET WQE", + [COMPST_COMP_WQE] = "COMP WQE", + [COMPST_COMP_ACK] = "COMP ACK", + [COMPST_CHECK_PSN] = "CHECK PSN", + [COMPST_CHECK_ACK] = "CHECK ACK", + [COMPST_READ] = "READ", + [COMPST_ATOMIC] = "ATOMIC", + [COMPST_WRITE_SEND] = "WRITE/SEND", + [COMPST_UPDATE_COMP] = "UPDATE COMP", + [COMPST_ERROR_RETRY] = "ERROR RETRY", + [COMPST_RNR_RETRY] = "RNR RETRY", + [COMPST_ERROR] = "ERROR", + [COMPST_EXIT] = "EXIT", + [COMPST_DONE] = "DONE", +}; + +static unsigned long rnrnak_usec[32] = { + [IB_RNR_TIMER_655_36] = 655360, + [IB_RNR_TIMER_000_01] = 10, + [IB_RNR_TIMER_000_02] = 20, + [IB_RNR_TIMER_000_03] = 30, + [IB_RNR_TIMER_000_04] = 40, + [IB_RNR_TIMER_000_06] = 60, + [IB_RNR_TIMER_000_08] = 80, + [IB_RNR_TIMER_000_12] = 120, + [IB_RNR_TIMER_000_16] = 160, + [IB_RNR_TIMER_000_24] = 240, + [IB_RNR_TIMER_000_32] = 320, + [IB_RNR_TIMER_000_48] = 480, + [IB_RNR_TIMER_000_64] = 640, + [IB_RNR_TIMER_000_96] = 960, + [IB_RNR_TIMER_001_28] = 1280, + [IB_RNR_TIMER_001_92] = 1920, + [IB_RNR_TIMER_002_56] = 2560, + [IB_RNR_TIMER_003_84] = 3840, + [IB_RNR_TIMER_005_12] = 5120, + [IB_RNR_TIMER_007_68] = 7680, + [IB_RNR_TIMER_010_24] = 10240, + [IB_RNR_TIMER_015_36] = 15360, + [IB_RNR_TIMER_020_48] = 20480, + [IB_RNR_TIMER_030_72] = 30720, + [IB_RNR_TIMER_040_96] = 40960, + [IB_RNR_TIMER_061_44] = 61410, + [IB_RNR_TIMER_081_92] = 81920, + [IB_RNR_TIMER_122_88] = 122880, + 
[IB_RNR_TIMER_163_84] = 163840, + [IB_RNR_TIMER_245_76] = 245760, + [IB_RNR_TIMER_327_68] = 327680, + [IB_RNR_TIMER_491_52] = 491520, +}; + +static inline unsigned long rnrnak_jiffies(u8 timeout) +{ + return max_t(unsigned long, + usecs_to_jiffies(rnrnak_usec[timeout]), 1); +} + +static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: return IB_WC_RDMA_WRITE; + case IB_WR_RDMA_WRITE_WITH_IMM: return IB_WC_RDMA_WRITE; + case IB_WR_SEND: return IB_WC_SEND; + case IB_WR_SEND_WITH_IMM: return IB_WC_SEND; + case IB_WR_RDMA_READ: return IB_WC_RDMA_READ; + case IB_WR_ATOMIC_CMP_AND_SWP: return IB_WC_COMP_SWAP; + case IB_WR_ATOMIC_FETCH_AND_ADD: return IB_WC_FETCH_ADD; + case IB_WR_LSO: return IB_WC_LSO; + case IB_WR_SEND_WITH_INV: return IB_WC_SEND; + case IB_WR_RDMA_READ_WITH_INV: return IB_WC_RDMA_READ; + case IB_WR_LOCAL_INV: return IB_WC_LOCAL_INV; + case IB_WR_REG_MR: return IB_WC_REG_MR; + + default: + return 0xff; + } +} + +void retransmit_timer(unsigned long data) +{ + struct rxe_qp *qp = (struct rxe_qp *)data; + + if (qp->valid) { + qp->comp.timeout = 1; + rxe_run_task(&qp->comp.task, 1); + } +} + +void rxe_comp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp, + struct sk_buff *skb) +{ + int must_sched; + + skb_queue_tail(&qp->resp_pkts, skb); + + must_sched = skb_queue_len(&qp->resp_pkts) > 1; + rxe_run_task(&qp->comp.task, must_sched); +} + +static inline enum comp_state get_wqe(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe **wqe_p) +{ + struct rxe_send_wqe *wqe; + + /* we come here whether or not we found a response packet to see if + * there are any posted WQEs + */ + wqe = queue_head(qp->sq.queue); + *wqe_p = wqe; + + /* no WQE or requester has not started it yet */ + if (!wqe || wqe->state == wqe_state_posted) + return pkt ? 
COMPST_DONE : COMPST_EXIT; + + /* WQE does not require an ack */ + if (wqe->state == wqe_state_done) + return COMPST_COMP_WQE; + + /* WQE caused an error */ + if (wqe->state == wqe_state_error) + return COMPST_ERROR; + + /* we have a WQE, if we also have an ack check its PSN */ + return pkt ? COMPST_CHECK_PSN : COMPST_EXIT; +} + +static inline void reset_retry_counters(struct rxe_qp *qp) +{ + qp->comp.retry_cnt = qp->attr.retry_cnt; + qp->comp.rnr_retry = qp->attr.rnr_retry; +} + +static inline enum comp_state check_psn(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + s32 diff; + + /* check to see if response is past the oldest WQE. if it is, complete + * send/write or error read/atomic + */ + diff = psn_compare(pkt->psn, wqe->last_psn); + if (diff > 0) { + if (wqe->state == wqe_state_pending) { + if (wqe->mask & WR_ATOMIC_OR_READ_MASK) + return COMPST_ERROR_RETRY; + + reset_retry_counters(qp); + return COMPST_COMP_WQE; + } else { + return COMPST_DONE; + } + } + + /* compare response packet to expected response */ + diff = psn_compare(pkt->psn, qp->comp.psn); + if (diff < 0) { + /* response is most likely a retried packet if it matches an + * uncompleted WQE go complete it else ignore it + */ + if (pkt->psn == wqe->last_psn) + return COMPST_COMP_ACK; + else + return COMPST_DONE; + } else if ((diff > 0) && (wqe->mask & WR_ATOMIC_OR_READ_MASK)) { + return COMPST_ERROR_RETRY; + } else { + return COMPST_CHECK_ACK; + } +} + +static inline enum comp_state check_ack(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + unsigned int mask = pkt->mask; + u8 syn; + + /* Check the sequence only */ + switch (qp->comp.opcode) { + case -1: + /* Will catch all *_ONLY cases. 
*/ + if (!(mask & RXE_START_MASK)) + return COMPST_ERROR; + + break; + + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE && + pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) { + return COMPST_ERROR; + } + break; + default: + WARN_ON(1); + } + + /* Check operation validity. */ + switch (pkt->opcode) { + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY: + syn = aeth_syn(pkt); + + if ((syn & AETH_TYPE_MASK) != AETH_ACK) + return COMPST_ERROR; + + /* Fall through (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE + * doesn't have an AETH) + */ + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + if (wqe->wr.opcode != IB_WR_RDMA_READ && + wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV) { + return COMPST_ERROR; + } + reset_retry_counters(qp); + return COMPST_READ; + + case IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE: + syn = aeth_syn(pkt); + + if ((syn & AETH_TYPE_MASK) != AETH_ACK) + return COMPST_ERROR; + + if (wqe->wr.opcode != IB_WR_ATOMIC_CMP_AND_SWP && + wqe->wr.opcode != IB_WR_ATOMIC_FETCH_AND_ADD) + return COMPST_ERROR; + reset_retry_counters(qp); + return COMPST_ATOMIC; + + case IB_OPCODE_RC_ACKNOWLEDGE: + syn = aeth_syn(pkt); + switch (syn & AETH_TYPE_MASK) { + case AETH_ACK: + reset_retry_counters(qp); + return COMPST_WRITE_SEND; + + case AETH_RNR_NAK: + return COMPST_RNR_RETRY; + + case AETH_NAK: + switch (syn) { + case AETH_NAK_PSN_SEQ_ERROR: + /* a nak implicitly acks all packets with psns + * before + */ + if (psn_compare(pkt->psn, qp->comp.psn) > 0) { + qp->comp.psn = pkt->psn; + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 1); + } + } + return COMPST_ERROR_RETRY; + + case AETH_NAK_INVALID_REQ: + wqe->status = IB_WC_REM_INV_REQ_ERR; + return COMPST_ERROR; + + case AETH_NAK_REM_ACC_ERR: + wqe->status = IB_WC_REM_ACCESS_ERR; + return COMPST_ERROR; + + case 
AETH_NAK_REM_OP_ERR: + wqe->status = IB_WC_REM_OP_ERR; + return COMPST_ERROR; + + default: + pr_warn("unexpected nak %x\n", syn); + wqe->status = IB_WC_REM_OP_ERR; + return COMPST_ERROR; + } + + default: + return COMPST_ERROR; + } + break; + + default: + pr_warn("unexpected opcode\n"); + } + + return COMPST_ERROR; +} + +static inline enum comp_state do_read(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int ret; + + ret = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, + &wqe->dma, payload_addr(pkt), + payload_size(pkt), to_mem_obj, NULL); + if (ret) + return COMPST_ERROR; + + if (wqe->dma.resid == 0 && (pkt->mask & RXE_END_MASK)) + return COMPST_COMP_ACK; + else + return COMPST_UPDATE_COMP; +} + +static inline enum comp_state do_atomic(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int ret; + + u64 atomic_orig = atmack_orig(pkt); + + ret = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, + &wqe->dma, &atomic_orig, + sizeof(u64), to_mem_obj, NULL); + if (ret) + return COMPST_ERROR; + else + return COMPST_COMP_ACK; +} + +static void make_send_cqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_cqe *cqe) +{ + memset(cqe, 0, sizeof(*cqe)); + + if (!qp->is_user) { + struct ib_wc *wc = &cqe->ibwc; + + wc->wr_id = wqe->wr.wr_id; + wc->status = wqe->status; + wc->opcode = wr_to_wc_opcode(wqe->wr.opcode); + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + wc->wc_flags = IB_WC_WITH_IMM; + wc->byte_len = wqe->dma.length; + wc->qp = &qp->ibqp; + } else { + struct ib_uverbs_wc *uwc = &cqe->uibwc; + + uwc->wr_id = wqe->wr.wr_id; + uwc->status = wqe->status; + uwc->opcode = wr_to_wc_opcode(wqe->wr.opcode); + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + uwc->wc_flags = IB_WC_WITH_IMM; + uwc->byte_len = wqe->dma.length; 
+ uwc->qp_num = qp->ibqp.qp_num; + } +} + +static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + struct rxe_cqe cqe; + + if ((qp->sq_sig_type == IB_SIGNAL_ALL_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + (qp->req.state == QP_STATE_ERROR)) { + make_send_cqe(qp, wqe, &cqe); + rxe_cq_post(qp->scq, &cqe, 0); + } + + advance_consumer(qp->sq.queue); + + /* + * we completed something so let req run again + * if it is trying to fence + */ + if (qp->req.wait_fence) { + qp->req.wait_fence = 0; + rxe_run_task(&qp->req.task, 1); + } +} + +static inline enum comp_state complete_ack(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + unsigned long flags; + + if (wqe->has_rd_atomic) { + wqe->has_rd_atomic = 0; + atomic_inc(&qp->req.rd_atomic); + if (qp->req.need_rd_atomic) { + qp->comp.timeout_retry = 0; + qp->req.need_rd_atomic = 0; + rxe_run_task(&qp->req.task, 1); + } + } + + if (unlikely(qp->req.state == QP_STATE_DRAIN)) { + /* state_lock used by requester & completer */ + spin_lock_irqsave(&qp->state_lock, flags); + if ((qp->req.state == QP_STATE_DRAIN) && + (qp->comp.psn == qp->req.psn)) { + qp->req.state = QP_STATE_DRAINED; + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_SQ_DRAINED; + qp->ibqp.event_handler(&ev, + qp->ibqp.qp_context); + } + } else { + spin_unlock_irqrestore(&qp->state_lock, flags); + } + } + + do_complete(qp, wqe); + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + return COMPST_UPDATE_COMP; + else + return COMPST_DONE; +} + +static inline enum comp_state complete_wqe(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + qp->comp.opcode = -1; + + if (pkt) { + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 
1); + } + } + + do_complete(qp, wqe); + + return COMPST_GET_WQE; +} + +int rxe_completer(void *arg) +{ + struct rxe_qp *qp = (struct rxe_qp *)arg; + struct rxe_send_wqe *wqe = wqe; + struct sk_buff *skb = NULL; + struct rxe_pkt_info *pkt = NULL; + enum comp_state state; + + if (!qp->valid) { + while ((skb = skb_dequeue(&qp->resp_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + skb = NULL; + pkt = NULL; + + while (queue_head(qp->sq.queue)) + advance_consumer(qp->sq.queue); + + goto exit; + } + + if (qp->req.state == QP_STATE_ERROR) { + while ((skb = skb_dequeue(&qp->resp_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + skb = NULL; + pkt = NULL; + + while ((wqe = queue_head(qp->sq.queue))) { + wqe->status = IB_WC_WR_FLUSH_ERR; + do_complete(qp, wqe); + } + + goto exit; + } + + if (qp->req.state == QP_STATE_RESET) { + while ((skb = skb_dequeue(&qp->resp_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + skb = NULL; + pkt = NULL; + + while (queue_head(qp->sq.queue)) + advance_consumer(qp->sq.queue); + + goto exit; + } + + if (qp->comp.timeout) { + qp->comp.timeout_retry = 1; + qp->comp.timeout = 0; + } else { + qp->comp.timeout_retry = 0; + } + + if (qp->req.need_retry) + goto exit; + + state = COMPST_GET_ACK; + + while (1) { + pr_debug("state = %s\n", comp_state_name[state]); + switch (state) { + case COMPST_GET_ACK: + skb = skb_dequeue(&qp->resp_pkts); + if (skb) { + pkt = SKB_TO_PKT(skb); + qp->comp.timeout_retry = 0; + } + state = COMPST_GET_WQE; + break; + + case COMPST_GET_WQE: + state = get_wqe(qp, pkt, &wqe); + break; + + case COMPST_CHECK_PSN: + state = check_psn(qp, pkt, wqe); + break; + + case COMPST_CHECK_ACK: + state = check_ack(qp, pkt, wqe); + break; + + case COMPST_READ: + state = do_read(qp, pkt, wqe); + break; + + case COMPST_ATOMIC: + state = do_atomic(qp, pkt, wqe); + break; + + case COMPST_WRITE_SEND: + if (wqe->state == wqe_state_pending && + wqe->last_psn == pkt->psn) + state = COMPST_COMP_ACK; + else + state = COMPST_UPDATE_COMP; + 
break; + + case COMPST_COMP_ACK: + state = complete_ack(qp, pkt, wqe); + break; + + case COMPST_COMP_WQE: + state = complete_wqe(qp, pkt, wqe); + break; + + case COMPST_UPDATE_COMP: + if (pkt->mask & RXE_END_MASK) + qp->comp.opcode = -1; + else + qp->comp.opcode = pkt->opcode; + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 1); + } + + state = COMPST_DONE; + break; + + case COMPST_DONE: + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + } + goto done; + + case COMPST_EXIT: + if (qp->comp.timeout_retry && wqe) { + state = COMPST_ERROR_RETRY; + break; + } + + /* reset the timeout counter if + * (1) QP is type RC + * (2) the QP is alive + * (3) there is a packet sent by the requester that + * might be acked (we still might get spurious + * timeouts but try to keep them as few as possible) + * (4) the timeout parameter is set + */ + if ((qp_type(qp) == IB_QPT_RC) && + (qp->req.state == QP_STATE_READY) && + (psn_compare(qp->req.psn, qp->comp.psn) > 0) && + qp->qp_timeout_jiffies) + mod_timer(&qp->retrans_timer, + jiffies + qp->qp_timeout_jiffies); + goto exit; + + case COMPST_ERROR_RETRY: + /* we come here if the retry timer fired and we did + * not receive a response packet. try to retry the send + * queue if that makes sense and the limits have not + * been exceeded. 
remember that some timeouts are + * spurious since we do not reset the timer but kick + * it down the road or let it expire + */ + + /* there is nothing to retry in this case */ + if (!wqe || (wqe->state == wqe_state_posted)) + goto exit; + + if (qp->comp.retry_cnt > 0) { + if (qp->comp.retry_cnt != 7) + qp->comp.retry_cnt--; + + /* no point in retrying if we have already + * seen the last ack that the requester could + * have caused + */ + if (psn_compare(qp->req.psn, + qp->comp.psn) > 0) { + /* tell the requester to retry the + * send queue next time around + */ + qp->req.need_retry = 1; + rxe_run_task(&qp->req.task, 1); + } + goto exit; + } else { + wqe->status = IB_WC_RETRY_EXC_ERR; + state = COMPST_ERROR; + } + break; + + case COMPST_RNR_RETRY: + if (qp->comp.rnr_retry > 0) { + if (qp->comp.rnr_retry != 7) + qp->comp.rnr_retry--; + + qp->req.need_retry = 1; + pr_debug("set rnr nak timer\n"); + mod_timer(&qp->rnr_nak_timer, + jiffies + rnrnak_jiffies(aeth_syn(pkt) + & ~AETH_TYPE_MASK)); + goto exit; + } else { + wqe->status = IB_WC_RNR_RETRY_EXC_ERR; + state = COMPST_ERROR; + } + break; + + case COMPST_ERROR: + do_complete(qp, wqe); + rxe_qp_error(qp); + goto exit; + } + } + +exit: + /* we come here if we are done with processing and want the task to + * exit from the loop calling us + */ + return -EAGAIN; + +done: + /* we come here if we have processed a packet and want the task to call + * us again to see if there is anything else to do + */ + return 0; +} diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c new file mode 100644 index 000000000000..e5e6a5e7dee9 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, + int cqe, int comp_vector, struct ib_udata *udata) +{ + int count; + + if (cqe <= 0) { + pr_warn("cqe(%d) <= 0\n", cqe); + goto err1; + } + + if (cqe > rxe->attr.max_cqe) { + pr_warn("cqe(%d) > max_cqe(%d)\n", + cqe, rxe->attr.max_cqe); + goto err1; + } + + if (cq) { + count = queue_count(cq->queue); + if (cqe < count) { + pr_warn("cqe(%d) < current # elements in queue (%d)", + cqe, count); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static void rxe_send_complete(unsigned long data) +{ + struct rxe_cq *cq = (struct rxe_cq *)data; + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, + int comp_vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + int err; + + cq->queue = rxe_queue_init(rxe, &cqe, + sizeof(struct rxe_cqe)); + if (!cq->queue) { + pr_warn("unable to create cq\n"); + return -ENOMEM; + } + + err = do_mmap_info(rxe, udata, false, context, cq->queue->buf, + cq->queue->buf_size, &cq->queue->ip); + if (err) { + kvfree(cq->queue->buf); + kfree(cq->queue); + return err; + } + + if (udata) + cq->is_user = 1; + + tasklet_init(&cq->comp_task, rxe_send_complete, (unsigned long)cq); + + spin_lock_init(&cq->cq_lock); + cq->ibcq.cqe = cqe; + return 0; +} + +int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, struct ib_udata *udata) +{ + int err; + + err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe, + sizeof(struct rxe_cqe), + cq->queue->ip ? 
cq->queue->ip->context : NULL, + udata, NULL, &cq->cq_lock); + if (!err) + cq->ibcq.cqe = cqe; + + return err; +} + +int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) +{ + struct ib_event ev; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + + if (unlikely(queue_full(cq->queue))) { + spin_unlock_irqrestore(&cq->cq_lock, flags); + if (cq->ibcq.event_handler) { + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + + return -EBUSY; + } + + memcpy(producer_addr(cq->queue), cqe, sizeof(*cqe)); + + /* make sure all changes to the CQ are written before we update the + * producer pointer + */ + smp_wmb(); + + advance_producer(cq->queue); + spin_unlock_irqrestore(&cq->cq_lock, flags); + + if ((cq->notify == IB_CQ_NEXT_COMP) || + (cq->notify == IB_CQ_SOLICITED && solicited)) { + cq->notify = 0; + tasklet_schedule(&cq->comp_task); + } + + return 0; +} + +void rxe_cq_cleanup(void *arg) +{ + struct rxe_cq *cq = arg; + + if (cq->queue) + rxe_queue_cleanup(cq->queue); +} diff --git a/drivers/infiniband/sw/rxe/rxe_dma.c b/drivers/infiniband/sw/rxe/rxe_dma.c new file mode 100644 index 000000000000..7634c1a81b2b --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_dma.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* value this file returns from the map routines to signal a failed mapping */ +#define DMA_BAD_ADDER ((u64)0) + +/* report whether dma_addr is the failed-mapping sentinel */ +static int rxe_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == DMA_BAD_ADDER; +} + +/* the "dma address" handed back is simply the cpu address itself */ +static u64 rxe_dma_map_single(struct ib_device *dev, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + WARN_ON(!valid_dma_direction(direction)); + return (uintptr_t)cpu_addr; +} + +/* nothing to undo; only the direction is sanity checked */ +static void rxe_dma_unmap_single(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + WARN_ON(!valid_dma_direction(direction)); +} + +/* Return page_address(page) + offset, or DMA_BAD_ADDER when the requested + * range runs past the end of the page or page_address() yields 0. + */ +static u64 rxe_dma_map_page(struct ib_device *dev, + struct page *page, + unsigned long offset, + size_t size, enum dma_data_direction direction) +{ + u64 addr; + + WARN_ON(!valid_dma_direction(direction)); + + if (offset + size > PAGE_SIZE) { + addr = DMA_BAD_ADDER; + goto done; + } + + addr = (uintptr_t)page_address(page); + if (addr) + addr += offset; + +done: + return addr; +} + +/* nothing to undo; only the direction is sanity checked */ +static void rxe_dma_unmap_page(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + WARN_ON(!valid_dma_direction(direction)); +} + +/* Fill in dma_address (and dma_length where configured) for every entry of + * the scatterlist from the page's cpu address. Returns nents, or 0 as soon + * as one entry's page_address() is 0. + */ +static int rxe_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + u64 addr; + int i; + int ret = nents; + + WARN_ON(!valid_dma_direction(direction)); + + for_each_sg(sgl, sg, nents, i) { + addr = (uintptr_t)page_address(sg_page(sg)); + if (!addr) { + ret = 0; + break; + } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif + } + + return ret; +} + +/* nothing to undo; only the direction is sanity checked */ +static void rxe_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + WARN_ON(!valid_dma_direction(direction)); +} + +/* intentionally empty */ +static void rxe_sync_single_for_cpu(struct ib_device *dev, + u64 addr, + size_t size, enum dma_data_direction dir) +{ +} + +/* intentionally empty */ +static void rxe_sync_single_for_device(struct ib_device *dev, + u64 addr, + 
size_t size, enum dma_data_direction dir) +{ +} + +/* Allocate enough pages to hold size bytes. Both the return value and + * *dma_handle (when dma_handle is non-NULL) carry the pages' cpu address, + * which is NULL/0 if the allocation failed. + */ +static void *rxe_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + + p = alloc_pages(flag, get_order(size)); + if (p) + addr = page_address(p); + + if (dma_handle) + *dma_handle = (uintptr_t)addr; + + return addr; +} + +/* free pages obtained from rxe_dma_alloc_coherent(); size must match */ +static void rxe_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + free_pages((unsigned long)cpu_addr, get_order(size)); +} + +/* table of the mapping routines defined above */ +struct ib_dma_mapping_ops rxe_dma_mapping_ops = { + .mapping_error = rxe_mapping_error, + .map_single = rxe_dma_map_single, + .unmap_single = rxe_dma_unmap_single, + .map_page = rxe_dma_map_page, + .unmap_page = rxe_dma_unmap_page, + .map_sg = rxe_map_sg, + .unmap_sg = rxe_unmap_sg, + .sync_single_for_cpu = rxe_sync_single_for_cpu, + .sync_single_for_device = rxe_sync_single_for_device, + .alloc_coherent = rxe_dma_alloc_coherent, + .free_coherent = rxe_dma_free_coherent +}; diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h new file mode 100644 index 000000000000..d57b5e956ceb --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -0,0 +1,952 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_HDR_H +#define RXE_HDR_H + +/* extracted information about a packet carried in an sk_buff struct fits in + * the skbuff cb array. Must be at most 48 bytes. stored in control block of + * sk_buff for received packets. + */ +struct rxe_pkt_info { + struct rxe_dev *rxe; /* device that owns packet */ + struct rxe_qp *qp; /* qp that owns packet */ + struct rxe_send_wqe *wqe; /* send wqe */ + u8 *hdr; /* points to bth */ + u32 mask; /* useful info about pkt */ + u32 psn; /* bth psn of packet */ + u16 pkey_index; /* partition of pkt */ + u16 paylen; /* length of bth - icrc */ + u8 port_num; /* port pkt received on */ + u8 opcode; /* bth opcode of packet */ + u8 offset; /* bth offset from pkt->hdr */ +}; + +/* Macros should be used only for received skb */ +#define SKB_TO_PKT(skb) ((struct rxe_pkt_info *)(skb)->cb) +#define PKT_TO_SKB(pkt) container_of((void *)(pkt), struct sk_buff, cb) + +/* + * IBA header types and methods + * + * Some of these are for reference and completeness only since + * rxe does not currently support RD transport + * most of this could be moved into IB core. 
ib_pack.h has + * part of this but is incomplete + * + * Header specific routines to insert/extract values to/from headers + * the routines that are named __hhh_(set_)fff() take a pointer to a + * hhh header and get(set) the fff field. The routines named + * hhh_(set_)fff take a packet info struct and find the + * header and field based on the opcode in the packet. + * Conversion to/from network byte order from cpu order is also done. + */ + +#define RXE_ICRC_SIZE (4) +#define RXE_MAX_HDR_LENGTH (80) + +/****************************************************************************** + * Base Transport Header + ******************************************************************************/ +struct rxe_bth { + u8 opcode; + u8 flags; + __be16 pkey; + __be32 qpn; + __be32 apsn; +}; + +#define BTH_TVER (0) +#define BTH_DEF_PKEY (0xffff) + +#define BTH_SE_MASK (0x80) +#define BTH_MIG_MASK (0x40) +#define BTH_PAD_MASK (0x30) +#define BTH_TVER_MASK (0x0f) +#define BTH_FECN_MASK (0x80000000) +#define BTH_BECN_MASK (0x40000000) +#define BTH_RESV6A_MASK (0x3f000000) +#define BTH_QPN_MASK (0x00ffffff) +#define BTH_ACK_MASK (0x80000000) +#define BTH_RESV7_MASK (0x7f000000) +#define BTH_PSN_MASK (0x00ffffff) + +static inline u8 __bth_opcode(void *arg) +{ + struct rxe_bth *bth = arg; + + return bth->opcode; +} + +static inline void __bth_set_opcode(void *arg, u8 opcode) +{ + struct rxe_bth *bth = arg; + + bth->opcode = opcode; +} + +static inline u8 __bth_se(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (BTH_SE_MASK & bth->flags); +} + +static inline void __bth_set_se(void *arg, int se) +{ + struct rxe_bth *bth = arg; + + if (se) + bth->flags |= BTH_SE_MASK; + else + bth->flags &= ~BTH_SE_MASK; +} + +static inline u8 __bth_mig(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (BTH_MIG_MASK & bth->flags); +} + +static inline void __bth_set_mig(void *arg, u8 mig) +{ + struct rxe_bth *bth = arg; + + if (mig) + bth->flags |= BTH_MIG_MASK; + else + bth->flags &= 
~BTH_MIG_MASK; +} + +static inline u8 __bth_pad(void *arg) +{ + struct rxe_bth *bth = arg; + + return (BTH_PAD_MASK & bth->flags) >> 4; +} + +static inline void __bth_set_pad(void *arg, u8 pad) +{ + struct rxe_bth *bth = arg; + + bth->flags = (BTH_PAD_MASK & (pad << 4)) | + (~BTH_PAD_MASK & bth->flags); +} + +static inline u8 __bth_tver(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_TVER_MASK & bth->flags; +} + +static inline void __bth_set_tver(void *arg, u8 tver) +{ + struct rxe_bth *bth = arg; + + bth->flags = (BTH_TVER_MASK & tver) | + (~BTH_TVER_MASK & bth->flags); +} + +static inline u16 __bth_pkey(void *arg) +{ + struct rxe_bth *bth = arg; + + return be16_to_cpu(bth->pkey); +} + +static inline void __bth_set_pkey(void *arg, u16 pkey) +{ + struct rxe_bth *bth = arg; + + bth->pkey = cpu_to_be16(pkey); +} + +static inline u32 __bth_qpn(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_QPN_MASK & be32_to_cpu(bth->qpn); +} + +static inline void __bth_set_qpn(void *arg, u32 qpn) +{ + struct rxe_bth *bth = arg; + u32 resvqpn = be32_to_cpu(bth->qpn); + + bth->qpn = cpu_to_be32((BTH_QPN_MASK & qpn) | + (~BTH_QPN_MASK & resvqpn)); +} + +static inline int __bth_fecn(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_FECN_MASK) & bth->qpn); +} + +static inline void __bth_set_fecn(void *arg, int fecn) +{ + struct rxe_bth *bth = arg; + + if (fecn) + bth->qpn |= cpu_to_be32(BTH_FECN_MASK); + else + bth->qpn &= ~cpu_to_be32(BTH_FECN_MASK); +} + +static inline int __bth_becn(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_BECN_MASK) & bth->qpn); +} + +static inline void __bth_set_becn(void *arg, int becn) +{ + struct rxe_bth *bth = arg; + + if (becn) + bth->qpn |= cpu_to_be32(BTH_BECN_MASK); + else + bth->qpn &= ~cpu_to_be32(BTH_BECN_MASK); +} + +static inline u8 __bth_resv6a(void *arg) +{ + struct rxe_bth *bth = arg; + + return (BTH_RESV6A_MASK & be32_to_cpu(bth->qpn)) >> 24; +} + +/* zero the resv6a field */ +static inline void 
__bth_set_resv6a(void *arg) +{ + struct rxe_bth *bth = arg; + + /* clear only the reserved bits, as __bth_set_resv7() does for apsn; + * a plain assignment would overwrite fecn, becn and the qpn + */ + bth->qpn &= ~cpu_to_be32(BTH_RESV6A_MASK); +} + +static inline int __bth_ack(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_ACK_MASK) & bth->apsn); +} + +static inline void __bth_set_ack(void *arg, int ack) +{ + struct rxe_bth *bth = arg; + + if (ack) + bth->apsn |= cpu_to_be32(BTH_ACK_MASK); + else + bth->apsn &= ~cpu_to_be32(BTH_ACK_MASK); +} + +static inline void __bth_set_resv7(void *arg) +{ + struct rxe_bth *bth = arg; + + bth->apsn &= ~cpu_to_be32(BTH_RESV7_MASK); +} + +static inline u32 __bth_psn(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_PSN_MASK & be32_to_cpu(bth->apsn); +} + +static inline void __bth_set_psn(void *arg, u32 psn) +{ + struct rxe_bth *bth = arg; + u32 apsn = be32_to_cpu(bth->apsn); + + bth->apsn = cpu_to_be32((BTH_PSN_MASK & psn) | + (~BTH_PSN_MASK & apsn)); +} + +static inline u8 bth_opcode(struct rxe_pkt_info *pkt) +{ + return __bth_opcode(pkt->hdr + pkt->offset); +} + +static inline void bth_set_opcode(struct rxe_pkt_info *pkt, u8 opcode) +{ + __bth_set_opcode(pkt->hdr + pkt->offset, opcode); +} + +static inline u8 bth_se(struct rxe_pkt_info *pkt) +{ + return __bth_se(pkt->hdr + pkt->offset); +} + +static inline void bth_set_se(struct rxe_pkt_info *pkt, int se) +{ + __bth_set_se(pkt->hdr + pkt->offset, se); +} + +static inline u8 bth_mig(struct rxe_pkt_info *pkt) +{ + return __bth_mig(pkt->hdr + pkt->offset); +} + +static inline void bth_set_mig(struct rxe_pkt_info *pkt, u8 mig) +{ + __bth_set_mig(pkt->hdr + pkt->offset, mig); +} + +static inline u8 bth_pad(struct rxe_pkt_info *pkt) +{ + return __bth_pad(pkt->hdr + pkt->offset); +} + +static inline void bth_set_pad(struct rxe_pkt_info *pkt, u8 pad) +{ + __bth_set_pad(pkt->hdr + pkt->offset, pad); +} + +static inline u8 bth_tver(struct rxe_pkt_info *pkt) +{ + return __bth_tver(pkt->hdr + pkt->offset); +} + +static inline void bth_set_tver(struct rxe_pkt_info *pkt, u8 tver) +{ + 
__bth_set_tver(pkt->hdr + pkt->offset, tver); +} + +static inline u16 bth_pkey(struct rxe_pkt_info *pkt) +{ + return __bth_pkey(pkt->hdr + pkt->offset); +} + +static inline void bth_set_pkey(struct rxe_pkt_info *pkt, u16 pkey) +{ + __bth_set_pkey(pkt->hdr + pkt->offset, pkey); +} + +static inline u32 bth_qpn(struct rxe_pkt_info *pkt) +{ + return __bth_qpn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_qpn(struct rxe_pkt_info *pkt, u32 qpn) +{ + __bth_set_qpn(pkt->hdr + pkt->offset, qpn); +} + +static inline int bth_fecn(struct rxe_pkt_info *pkt) +{ + return __bth_fecn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_fecn(struct rxe_pkt_info *pkt, int fecn) +{ + __bth_set_fecn(pkt->hdr + pkt->offset, fecn); +} + +static inline int bth_becn(struct rxe_pkt_info *pkt) +{ + return __bth_becn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_becn(struct rxe_pkt_info *pkt, int becn) +{ + __bth_set_becn(pkt->hdr + pkt->offset, becn); +} + +static inline u8 bth_resv6a(struct rxe_pkt_info *pkt) +{ + return __bth_resv6a(pkt->hdr + pkt->offset); +} + +static inline void bth_set_resv6a(struct rxe_pkt_info *pkt) +{ + __bth_set_resv6a(pkt->hdr + pkt->offset); +} + +static inline int bth_ack(struct rxe_pkt_info *pkt) +{ + return __bth_ack(pkt->hdr + pkt->offset); +} + +static inline void bth_set_ack(struct rxe_pkt_info *pkt, int ack) +{ + __bth_set_ack(pkt->hdr + pkt->offset, ack); +} + +static inline void bth_set_resv7(struct rxe_pkt_info *pkt) +{ + __bth_set_resv7(pkt->hdr + pkt->offset); +} + +static inline u32 bth_psn(struct rxe_pkt_info *pkt) +{ + return __bth_psn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_psn(struct rxe_pkt_info *pkt, u32 psn) +{ + __bth_set_psn(pkt->hdr + pkt->offset, psn); +} + +/* fill in a complete bth; every reserved bit ends up zero */ +static inline void bth_init(struct rxe_pkt_info *pkt, u8 opcode, int se, + int mig, int pad, u16 pkey, u32 qpn, int ack_req, + u32 psn) +{ + struct rxe_bth *bth = (struct rxe_bth *)(pkt->hdr + pkt->offset); + + bth->opcode = opcode; + 
bth->flags = (pad << 4) & BTH_PAD_MASK; + if (se) + bth->flags |= BTH_SE_MASK; + if (mig) + bth->flags |= BTH_MIG_MASK; + bth->pkey = cpu_to_be16(pkey); + bth->qpn = cpu_to_be32(qpn & BTH_QPN_MASK); + psn &= BTH_PSN_MASK; + if (ack_req) + psn |= BTH_ACK_MASK; + bth->apsn = cpu_to_be32(psn); +} + +/****************************************************************************** + * Reliable Datagram Extended Transport Header + ******************************************************************************/ +struct rxe_rdeth { + __be32 een; +}; + +#define RDETH_EEN_MASK (0x00ffffff) + +/* the een field is 24 bits wide (RDETH_EEN_MASK), so return it as a u32 + * rather than truncating it to its low byte + */ +static inline u32 __rdeth_een(void *arg) +{ + struct rxe_rdeth *rdeth = arg; + + return RDETH_EEN_MASK & be32_to_cpu(rdeth->een); +} + +static inline void __rdeth_set_een(void *arg, u32 een) +{ + struct rxe_rdeth *rdeth = arg; + + rdeth->een = cpu_to_be32(RDETH_EEN_MASK & een); +} + +static inline u32 rdeth_een(struct rxe_pkt_info *pkt) +{ + return __rdeth_een(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RDETH]); +} + +static inline void rdeth_set_een(struct rxe_pkt_info *pkt, u32 een) +{ + __rdeth_set_een(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RDETH], een); +} + +/****************************************************************************** + * Datagram Extended Transport Header + ******************************************************************************/ +struct rxe_deth { + __be32 qkey; + __be32 sqp; +}; + +#define GSI_QKEY (0x80010000) +#define DETH_SQP_MASK (0x00ffffff) + +static inline u32 __deth_qkey(void *arg) +{ + struct rxe_deth *deth = arg; + + return be32_to_cpu(deth->qkey); +} + +static inline void __deth_set_qkey(void *arg, u32 qkey) +{ + struct rxe_deth *deth = arg; + + deth->qkey = cpu_to_be32(qkey); +} + +static inline u32 __deth_sqp(void *arg) +{ + struct rxe_deth *deth = arg; + + return DETH_SQP_MASK & be32_to_cpu(deth->sqp); +} + +static inline void __deth_set_sqp(void *arg, u32 sqp) +{ + struct rxe_deth *deth = arg; + + 
deth->sqp = cpu_to_be32(DETH_SQP_MASK & sqp); +} + +static inline u32 deth_qkey(struct rxe_pkt_info *pkt) +{ + return __deth_qkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH]); +} + +static inline void deth_set_qkey(struct rxe_pkt_info *pkt, u32 qkey) +{ + __deth_set_qkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH], qkey); +} + +static inline u32 deth_sqp(struct rxe_pkt_info *pkt) +{ + return __deth_sqp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH]); +} + +static inline void deth_set_sqp(struct rxe_pkt_info *pkt, u32 sqp) +{ + __deth_set_sqp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH], sqp); +} + +/****************************************************************************** + * RDMA Extended Transport Header + ******************************************************************************/ +struct rxe_reth { + __be64 va; + __be32 rkey; + __be32 len; +}; + +static inline u64 __reth_va(void *arg) +{ + struct rxe_reth *reth = arg; + + return be64_to_cpu(reth->va); +} + +static inline void __reth_set_va(void *arg, u64 va) +{ + struct rxe_reth *reth = arg; + + reth->va = cpu_to_be64(va); +} + +static inline u32 __reth_rkey(void *arg) +{ + struct rxe_reth *reth = arg; + + return be32_to_cpu(reth->rkey); +} + +static inline void __reth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_reth *reth = arg; + + reth->rkey = cpu_to_be32(rkey); +} + +static inline u32 __reth_len(void *arg) +{ + struct rxe_reth *reth = arg; + + return be32_to_cpu(reth->len); +} + +static inline void __reth_set_len(void *arg, u32 len) +{ + struct rxe_reth *reth = arg; + + reth->len = cpu_to_be32(len); +} + +static inline u64 reth_va(struct rxe_pkt_info *pkt) +{ + return __reth_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_va(struct rxe_pkt_info *pkt, u64 va) +{ + __reth_set_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], va); 
+} + +static inline u32 reth_rkey(struct rxe_pkt_info *pkt) +{ + return __reth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __reth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], rkey); +} + +static inline u32 reth_len(struct rxe_pkt_info *pkt) +{ + return __reth_len(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len) +{ + __reth_set_len(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], len); +} + +/****************************************************************************** + * Atomic Extended Transport Header + ******************************************************************************/ +struct rxe_atmeth { + __be64 va; + __be32 rkey; + __be64 swap_add; + __be64 comp; +} __attribute__((__packed__)); + +static inline u64 __atmeth_va(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->va); +} + +static inline void __atmeth_set_va(void *arg, u64 va) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->va = cpu_to_be64(va); +} + +static inline u32 __atmeth_rkey(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be32_to_cpu(atmeth->rkey); +} + +static inline void __atmeth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->rkey = cpu_to_be32(rkey); +} + +static inline u64 __atmeth_swap_add(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->swap_add); +} + +static inline void __atmeth_set_swap_add(void *arg, u64 swap_add) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->swap_add = cpu_to_be64(swap_add); +} + +static inline u64 __atmeth_comp(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->comp); +} + +static inline void __atmeth_set_comp(void *arg, u64 comp) +{ + struct rxe_atmeth *atmeth = arg; + 
+ atmeth->comp = cpu_to_be64(comp); +} + +static inline u64 atmeth_va(struct rxe_pkt_info *pkt) +{ + return __atmeth_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_va(struct rxe_pkt_info *pkt, u64 va) +{ + __atmeth_set_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], va); +} + +static inline u32 atmeth_rkey(struct rxe_pkt_info *pkt) +{ + return __atmeth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __atmeth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], rkey); +} + +static inline u64 atmeth_swap_add(struct rxe_pkt_info *pkt) +{ + return __atmeth_swap_add(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_swap_add(struct rxe_pkt_info *pkt, u64 swap_add) +{ + __atmeth_set_swap_add(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], swap_add); +} + +static inline u64 atmeth_comp(struct rxe_pkt_info *pkt) +{ + return __atmeth_comp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_comp(struct rxe_pkt_info *pkt, u64 comp) +{ + __atmeth_set_comp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], comp); +} + +/****************************************************************************** + * Ack Extended Transport Header + ******************************************************************************/ +struct rxe_aeth { + __be32 smsn; +}; + +#define AETH_SYN_MASK (0xff000000) +#define AETH_MSN_MASK (0x00ffffff) + +enum aeth_syndrome { + AETH_TYPE_MASK = 0xe0, + AETH_ACK = 0x00, + AETH_RNR_NAK = 0x20, + AETH_RSVD = 0x40, + AETH_NAK = 0x60, + AETH_ACK_UNLIMITED = 0x1f, + AETH_NAK_PSN_SEQ_ERROR = 0x60, + AETH_NAK_INVALID_REQ = 0x61, + AETH_NAK_REM_ACC_ERR = 0x62, + AETH_NAK_REM_OP_ERR = 0x63, + 
AETH_NAK_INV_RD_REQ = 0x64, +}; + +static inline u8 __aeth_syn(void *arg) +{ + struct rxe_aeth *aeth = arg; + + return (AETH_SYN_MASK & be32_to_cpu(aeth->smsn)) >> 24; +} + +static inline void __aeth_set_syn(void *arg, u8 syn) +{ + struct rxe_aeth *aeth = arg; + u32 smsn = be32_to_cpu(aeth->smsn); + + /* widen before shifting: syn is promoted to a signed int and a + * value >= 0x80 would otherwise be shifted into the sign bit + */ + aeth->smsn = cpu_to_be32((AETH_SYN_MASK & ((u32)syn << 24)) | + (~AETH_SYN_MASK & smsn)); +} + +static inline u32 __aeth_msn(void *arg) +{ + struct rxe_aeth *aeth = arg; + + return AETH_MSN_MASK & be32_to_cpu(aeth->smsn); +} + +static inline void __aeth_set_msn(void *arg, u32 msn) +{ + struct rxe_aeth *aeth = arg; + u32 smsn = be32_to_cpu(aeth->smsn); + + aeth->smsn = cpu_to_be32((AETH_MSN_MASK & msn) | + (~AETH_MSN_MASK & smsn)); +} + +static inline u8 aeth_syn(struct rxe_pkt_info *pkt) +{ + return __aeth_syn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH]); +} + +static inline void aeth_set_syn(struct rxe_pkt_info *pkt, u8 syn) +{ + __aeth_set_syn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH], syn); +} + +static inline u32 aeth_msn(struct rxe_pkt_info *pkt) +{ + return __aeth_msn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH]); +} + +static inline void aeth_set_msn(struct rxe_pkt_info *pkt, u32 msn) +{ + __aeth_set_msn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH], msn); +} + +/****************************************************************************** + * Atomic Ack Extended Transport Header + ******************************************************************************/ +struct rxe_atmack { + __be64 orig; +}; + +static inline u64 __atmack_orig(void *arg) +{ + struct rxe_atmack *atmack = arg; + + return be64_to_cpu(atmack->orig); +} + +static inline void __atmack_set_orig(void *arg, u64 orig) +{ + struct rxe_atmack *atmack = arg; + + atmack->orig = cpu_to_be64(orig); +} + +static inline u64 atmack_orig(struct rxe_pkt_info *pkt) +{ + return __atmack_orig(pkt->hdr + pkt->offset + + 
rxe_opcode[pkt->opcode].offset[RXE_ATMACK]); +} + +static inline void atmack_set_orig(struct rxe_pkt_info *pkt, u64 orig) +{ + __atmack_set_orig(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMACK], orig); +} + +/****************************************************************************** + * Immediate Extended Transport Header + ******************************************************************************/ +struct rxe_immdt { + __be32 imm; +}; + +/* immediate data is passed through in network byte order, unconverted */ +static inline __be32 __immdt_imm(void *arg) +{ + struct rxe_immdt *immdt = arg; + + return immdt->imm; +} + +static inline void __immdt_set_imm(void *arg, __be32 imm) +{ + struct rxe_immdt *immdt = arg; + + immdt->imm = imm; +} + +static inline __be32 immdt_imm(struct rxe_pkt_info *pkt) +{ + return __immdt_imm(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IMMDT]); +} + +static inline void immdt_set_imm(struct rxe_pkt_info *pkt, __be32 imm) +{ + __immdt_set_imm(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IMMDT], imm); +} + +/****************************************************************************** + * Invalidate Extended Transport Header + ******************************************************************************/ +struct rxe_ieth { + __be32 rkey; +}; + +static inline u32 __ieth_rkey(void *arg) +{ + struct rxe_ieth *ieth = arg; + + return be32_to_cpu(ieth->rkey); +} + +static inline void __ieth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_ieth *ieth = arg; + + ieth->rkey = cpu_to_be32(rkey); +} + +static inline u32 ieth_rkey(struct rxe_pkt_info *pkt) +{ + return __ieth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IETH]); +} + +static inline void ieth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __ieth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IETH], rkey); +} + +enum rxe_hdr_length { + RXE_BTH_BYTES = sizeof(struct rxe_bth), + RXE_DETH_BYTES = sizeof(struct rxe_deth), + RXE_IMMDT_BYTES = sizeof(struct 
rxe_immdt), + RXE_RETH_BYTES = sizeof(struct rxe_reth), + RXE_AETH_BYTES = sizeof(struct rxe_aeth), + RXE_ATMACK_BYTES = sizeof(struct rxe_atmack), + RXE_ATMETH_BYTES = sizeof(struct rxe_atmeth), + RXE_IETH_BYTES = sizeof(struct rxe_ieth), + RXE_RDETH_BYTES = sizeof(struct rxe_rdeth), +}; + +/* bytes from pkt->hdr up to the end of the opcode's transport headers */ +static inline size_t header_size(struct rxe_pkt_info *pkt) +{ + return pkt->offset + rxe_opcode[pkt->opcode].length; +} + +static inline void *payload_addr(struct rxe_pkt_info *pkt) +{ + return pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD]; +} + +/* paylen minus the transport headers, the pad bytes and the icrc */ +static inline size_t payload_size(struct rxe_pkt_info *pkt) +{ + return pkt->paylen - rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD] + - bth_pad(pkt) - RXE_ICRC_SIZE; +} + +#endif /* RXE_HDR_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c new file mode 100644 index 000000000000..413b56b23a06 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_icrc.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* Compute a partial ICRC for all the IB transport headers. */ +u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + unsigned int bth_offset = 0; + struct iphdr *ip4h = NULL; + struct ipv6hdr *ip6h = NULL; + struct udphdr *udph; + struct rxe_bth *bth; + int crc; + int length; + int hdr_size = sizeof(struct udphdr) + + (skb->protocol == htons(ETH_P_IP) ? + sizeof(struct iphdr) : sizeof(struct ipv6hdr)); + /* pseudo header buffer size is calculate using ipv6 header size since + * it is bigger than ipv4 + */ + u8 pshdr[sizeof(struct udphdr) + + sizeof(struct ipv6hdr) + + RXE_BTH_BYTES]; + + /* This seed is the result of computing a CRC with a seed of + * 0xfffffff and 8 bytes of 0xff representing a masked LRH. 
+ */ + crc = 0xdebb20e3; + + if (skb->protocol == htons(ETH_P_IP)) { /* IPv4 */ + memcpy(pshdr, ip_hdr(skb), hdr_size); + ip4h = (struct iphdr *)pshdr; + udph = (struct udphdr *)(ip4h + 1); + + ip4h->ttl = 0xff; + ip4h->check = CSUM_MANGLED_0; + ip4h->tos = 0xff; + } else { /* IPv6 */ + memcpy(pshdr, ipv6_hdr(skb), hdr_size); + ip6h = (struct ipv6hdr *)pshdr; + udph = (struct udphdr *)(ip6h + 1); + + memset(ip6h->flow_lbl, 0xff, sizeof(ip6h->flow_lbl)); + ip6h->priority = 0xf; + ip6h->hop_limit = 0xff; + } + udph->check = CSUM_MANGLED_0; + + bth_offset += hdr_size; + + memcpy(&pshdr[bth_offset], pkt->hdr, RXE_BTH_BYTES); + bth = (struct rxe_bth *)&pshdr[bth_offset]; + + /* exclude bth.resv8a */ + bth->qpn |= cpu_to_be32(~BTH_QPN_MASK); + + length = hdr_size + RXE_BTH_BYTES; + crc = crc32_le(crc, pshdr, length); + + /* And finish to compute the CRC on the remainder of the headers. */ + crc = crc32_le(crc, pkt->hdr + RXE_BTH_BYTES, + rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES); + return crc; +} diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h new file mode 100644 index 000000000000..4a5484ef604f --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_LOC_H
+#define RXE_LOC_H
+
+/* rxe_av.c */
+
+int rxe_av_chk_attr(struct rxe_dev *rxe, struct ib_ah_attr *attr);
+
+int rxe_av_from_attr(struct rxe_dev *rxe, u8 port_num,
+		     struct rxe_av *av, struct ib_ah_attr *attr);
+
+int rxe_av_to_attr(struct rxe_dev *rxe, struct rxe_av *av,
+		   struct ib_ah_attr *attr);
+
+int rxe_av_fill_ip_info(struct rxe_dev *rxe,
+			struct rxe_av *av,
+			struct ib_ah_attr *attr,
+			struct ib_gid_attr *sgid_attr,
+			union ib_gid *sgid);
+
+struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt);
+
+/* rxe_cq.c */
+int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
+		    int cqe, int comp_vector, struct ib_udata *udata);
+
+int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
+		     int comp_vector, struct ib_ucontext *context,
+		     struct ib_udata *udata);
+
+int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, struct ib_udata *udata);
+
+int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited);
+
+void rxe_cq_cleanup(void *arg);
+
+/* rxe_mcast.c */
+int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
+		      struct rxe_mc_grp **grp_p);
+
+int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+			   struct rxe_mc_grp *grp);
+
+int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+			    union ib_gid *mgid);
+
+void rxe_drop_all_mcast_groups(struct rxe_qp *qp);
+
+void rxe_mc_cleanup(void *arg);
+
+/* rxe_mmap.c */
+
+/* Bookkeeping for one object that user space may mmap(); entries wait on
+ * the device's pending_mmaps list until rxe_mmap() claims them.
+ */
+struct rxe_mmap_info {
+	struct list_head	pending_mmaps;
+	struct ib_ucontext	*context;
+	struct kref		ref;
+	void			*obj;
+
+	struct mminfo info;
+};
+
+void rxe_mmap_release(struct kref *ref);
+
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev,
+					   u32 size,
+					   struct ib_ucontext *context,
+					   void *obj);
+
+int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+/* rxe_mr.c */
+
+/* direction of a copy relative to the memory object */
+enum copy_direction {
+	to_mem_obj,
+	from_mem_obj,
+};
+
+int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd,
+		     int access, struct rxe_mem *mem);
+
+int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start,
+		      u64 length, u64 iova, int access, struct ib_udata *udata,
+		      struct rxe_mem *mr);
+
+int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd,
+		      int max_pages, struct rxe_mem *mem);
+
+int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr,
+		 int length, enum copy_direction dir, u32 *crcp);
+
+int copy_data(struct rxe_dev *rxe, struct rxe_pd *pd, int access,
+	      struct rxe_dma_info *dma, void *addr, int length,
+	      enum copy_direction dir, u32 *crcp);
+
+void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length);
+
+/* which key (lkey or rkey) lookup_mem() compares against */
+enum lookup_type {
+	lookup_local,
+	lookup_remote,
+};
+
+struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key,
+			   enum lookup_type type);
+
+int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length);
+
+int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem,
+		      u64 *page, int num_pages, u64 iova);
+
+void rxe_mem_cleanup(void *arg);
+
+int advance_dma_data(struct rxe_dma_info *dma, unsigned int length);
+
+/* rxe_qp.c */
+int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init);
+
+int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
+		     struct ib_qp_init_attr *init, struct ib_udata *udata,
+		     struct ib_pd *ibpd);
+
+int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init);
+
+int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
+		    struct ib_qp_attr *attr, int mask);
+
+int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr,
+		     int mask, struct ib_udata *udata);
+
+int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask);
+
+void rxe_qp_error(struct rxe_qp *qp);
+
+void rxe_qp_destroy(struct rxe_qp *qp);
+
+void rxe_qp_cleanup(void *arg);
+
+/* QP number as stored in the embedded ib_qp */
+static inline int qp_num(struct rxe_qp *qp)
+{
+	return qp->ibqp.qp_num;
+}
+
+/* QP transport type as stored in the embedded ib_qp */
+static inline enum ib_qp_type qp_type(struct rxe_qp *qp)
+{
+	return qp->ibqp.qp_type;
+}
+
+/* current QP state taken from the QP attributes */
+static inline enum ib_qp_state qp_state(struct rxe_qp *qp)
+{
+	return qp->attr.qp_state;
+}
+
+/* RC and UC QPs use their path MTU; every other type uses the port
+ * maximum.
+ */
+static inline int qp_mtu(struct rxe_qp *qp)
+{
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+		return qp->attr.path_mtu;
+	default:
+		return RXE_PORT_MAX_MTU;
+	}
+}
+
+/* bytes needed for one receive WQE followed by max_sge SGEs */
+static inline int rcv_wqe_size(int max_sge)
+{
+	return max_sge * sizeof(struct ib_sge) +
+		sizeof(struct rxe_recv_wqe);
+}
+
+void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res);
+
+/* step the responder resource head, wrapping at max_rd_atomic */
+static inline void rxe_advance_resp_resource(struct rxe_qp *qp)
+{
+	if (unlikely(++qp->resp.res_head == qp->attr.max_rd_atomic))
+		qp->resp.res_head = 0;
+}
+
+void retransmit_timer(unsigned long data);
+void rnr_nak_timer(unsigned long data);
+
+void dump_qp(struct rxe_qp *qp);
+
+/* rxe_srq.c */
+#define IB_SRQ_INIT_MASK (~IB_SRQ_LIMIT)
+
+int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+		     struct ib_srq_attr *attr, enum ib_srq_attr_mask mask);
+
+int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
+		      struct ib_srq_init_attr *init,
+		      struct ib_ucontext *context, struct ib_udata *udata);
+
+int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+		      struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
+		      struct ib_udata *udata);
+
+extern struct ib_dma_mapping_ops rxe_dma_mapping_ops;
+
+void rxe_release(struct kref *kref); + +int rxe_completer(void *arg); +int rxe_requester(void *arg); +int rxe_responder(void *arg); + +u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb); + +void rxe_resp_queue_pkt(struct rxe_dev *rxe, + struct rxe_qp *qp, struct sk_buff *skb); + +void rxe_comp_queue_pkt(struct rxe_dev *rxe, + struct rxe_qp *qp, struct sk_buff *skb); + +static inline unsigned wr_opcode_mask(int opcode, struct rxe_qp *qp) +{ + return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type]; +} + +static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp, + struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + int err; + int is_request = pkt->mask & RXE_REQ_MASK; + + if ((is_request && (qp->req.state != QP_STATE_READY)) || + (!is_request && (qp->resp.state != QP_STATE_READY))) { + pr_info("Packet dropped. QP is not in ready state\n"); + goto drop; + } + + if (pkt->mask & RXE_LOOPBACK_MASK) { + memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); + err = rxe->ifc_ops->loopback(skb); + } else { + err = rxe->ifc_ops->send(rxe, pkt, skb); + } + + if (err) { + rxe->xmit_errors++; + return err; + } + + atomic_inc(&qp->skb_out); + + if ((qp_type(qp) != IB_QPT_RC) && + (pkt->mask & RXE_END_MASK)) { + pkt->wqe->state = wqe_state_done; + rxe_run_task(&qp->comp.task, 1); + } + + goto done; + +drop: + kfree_skb(skb); + err = 0; +done: + return err; +} + +#endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c new file mode 100644 index 000000000000..fa95544ca7e0 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* Find the multicast group keyed by @mgid in the device's group pool, or
+ * allocate and register a new one (including the mcast_add interface
+ * op). On success the group is stored in *@grp_p and 0 is returned.
+ * Returns -EINVAL if the device allows no multicast attachments,
+ * -ENOMEM if allocation fails, or the error from mcast_add.
+ */
+int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
+		      struct rxe_mc_grp **grp_p)
+{
+	int err;
+	struct rxe_mc_grp *grp;
+
+	if (rxe->attr.max_mcast_qp_attach == 0) {
+		err = -EINVAL;
+		goto err1;
+	}
+
+	grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid);
+	if (grp)
+		goto done;
+
+	grp = rxe_alloc(&rxe->mc_grp_pool);
+	if (!grp) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	INIT_LIST_HEAD(&grp->qp_list);
+	spin_lock_init(&grp->mcg_lock);
+	grp->rxe = rxe;
+
+	rxe_add_key(grp, mgid);
+
+	err = rxe->ifc_ops->mcast_add(rxe, mgid);
+	if (err)
+		goto err2;
+
+done:
+	*grp_p = grp;
+	return 0;
+
+err2:
+	rxe_drop_ref(grp);
+err1:
+	return err;
+}
+
+/* Attach @qp to @grp. Returns 0 if the QP is attached afterwards
+ * (including when it already was), -ENOMEM if the group is full or no
+ * element could be allocated. Both the QP's and the group's list locks
+ * are held, in that order, for the whole operation.
+ */
+int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+			   struct rxe_mc_grp *grp)
+{
+	int err;
+	struct rxe_mc_elem *elem;
+
+	/* check to see if the qp is already a member of the group */
+	spin_lock_bh(&qp->grp_lock);
+	spin_lock_bh(&grp->mcg_lock);
+	list_for_each_entry(elem, &grp->qp_list, qp_list) {
+		if (elem->qp == qp) {
+			err = 0;
+			goto out;
+		}
+	}
+
+	if (grp->num_qp >= rxe->attr.max_mcast_qp_attach) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	elem = rxe_alloc(&rxe->mc_elem_pool);
+	if (!elem) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* each qp holds a ref on the grp */
+	rxe_add_ref(grp);
+
+	grp->num_qp++;
+	elem->qp = qp;
+	elem->grp = grp;
+
+	/* the element is linked on both the group's and the qp's list */
+	list_add(&elem->qp_list, &grp->qp_list);
+	list_add(&elem->grp_list, &qp->grp_list);
+
+	err = 0;
+out:
+	spin_unlock_bh(&grp->mcg_lock);
+	spin_unlock_bh(&qp->grp_lock);
+	return err;
+}
+
+/* Detach @qp from the group keyed by @mgid. Returns 0 when the QP was a
+ * member and has been removed, -EINVAL when the group does not exist or
+ * the QP is not attached to it.
+ */
+int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+			    union ib_gid *mgid)
+{
+	struct rxe_mc_grp *grp;
+	struct rxe_mc_elem *elem, *tmp;
+
+	grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid);
+	if (!grp)
+		goto err1;
+
+	spin_lock_bh(&qp->grp_lock);
+	spin_lock_bh(&grp->mcg_lock);
+
+	list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) {
+		if (elem->qp == qp) {
+			list_del(&elem->qp_list);
+			list_del(&elem->grp_list);
+			grp->num_qp--;
+
+			/* drop the locks before dropping references */
+			spin_unlock_bh(&grp->mcg_lock);
+			spin_unlock_bh(&qp->grp_lock);
+			rxe_drop_ref(elem);
+			rxe_drop_ref(grp);	/* ref held by QP */
+			rxe_drop_ref(grp);	/* ref from get_key */
+			return 0;
+		}
+	}
+
+	spin_unlock_bh(&grp->mcg_lock);
+	spin_unlock_bh(&qp->grp_lock);
+	rxe_drop_ref(grp);			/* ref from get_key */
+err1:
+	return -EINVAL;
+}
+
+/* Detach @qp from every multicast group it is attached to. The QP list
+ * lock and the group list lock are taken one after the other here, never
+ * nested.
+ */
+void rxe_drop_all_mcast_groups(struct rxe_qp *qp)
+{
+	struct rxe_mc_grp *grp;
+	struct rxe_mc_elem *elem;
+
+	while (1) {
+		spin_lock_bh(&qp->grp_lock);
+		if (list_empty(&qp->grp_list)) {
+			spin_unlock_bh(&qp->grp_lock);
+			break;
+		}
+		elem = list_first_entry(&qp->grp_list, struct rxe_mc_elem,
+					grp_list);
+		list_del(&elem->grp_list);
+		spin_unlock_bh(&qp->grp_lock);
+
+		grp = elem->grp;
+		spin_lock_bh(&grp->mcg_lock);
+		list_del(&elem->qp_list);
+		grp->num_qp--;
+		spin_unlock_bh(&grp->mcg_lock);
+		rxe_drop_ref(grp);
+		rxe_drop_ref(elem);
+	}
+}
+
+/* Cleanup callback for a multicast group (@arg is the struct
+ * rxe_mc_grp): removes its key and calls the mcast_delete interface op.
+ */
+void rxe_mc_cleanup(void *arg)
+{
+	struct rxe_mc_grp *grp = arg;
+	struct rxe_dev *rxe = grp->rxe;
+
+	rxe_drop_key(grp);
+	rxe->ifc_ops->mcast_delete(rxe, &grp->mgid);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c
new file mode 100644
index 000000000000..54b3c7c99eff
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_mmap.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/errno.h> +#include <asm/pgtable.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +void rxe_mmap_release(struct kref *ref) +{ + struct rxe_mmap_info *ip = container_of(ref, + struct rxe_mmap_info, ref); + struct rxe_dev *rxe = to_rdev(ip->context->device); + + spin_lock_bh(&rxe->pending_lock); + + if (!list_empty(&ip->pending_mmaps)) + list_del(&ip->pending_mmaps); + + spin_unlock_bh(&rxe->pending_lock); + + vfree(ip->obj); /* buf */ + kfree(ip); +} + +/* + * open and close keep track of how many times the memory region is mapped, + * to avoid releasing it. 
+ */ +static void rxe_vma_open(struct vm_area_struct *vma) +{ + struct rxe_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void rxe_vma_close(struct vm_area_struct *vma) +{ + struct rxe_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, rxe_mmap_release); +} + +static struct vm_operations_struct rxe_vm_ops = { + .open = rxe_vma_open, + .close = rxe_vma_close, +}; + +/** + * rxe_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. + */ +int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct rxe_dev *rxe = to_rdev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct rxe_mmap_info *ip, *pp; + int ret; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_bh(&rxe->pending_lock); + list_for_each_entry_safe(ip, pp, &rxe->pending_mmaps, pending_mmaps) { + if (context != ip->context || (__u64)offset != ip->info.offset) + continue; + + /* Don't allow a mmap larger than the object. 
*/ + if (size > ip->info.size) { + pr_err("mmap region is larger than the object!\n"); + spin_unlock_bh(&rxe->pending_lock); + ret = -EINVAL; + goto done; + } + + goto found_it; + } + pr_warn("unable to find pending mmap info\n"); + spin_unlock_bh(&rxe->pending_lock); + ret = -EINVAL; + goto done; + +found_it: + list_del_init(&ip->pending_mmaps); + spin_unlock_bh(&rxe->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) { + pr_err("rxe: err %d from remap_vmalloc_range\n", ret); + goto done; + } + + vma->vm_ops = &rxe_vm_ops; + vma->vm_private_data = ip; + rxe_vma_open(vma); +done: + return ret; +} + +/* + * Allocate information for rxe_mmap + */ +struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe, + u32 size, + struct ib_ucontext *context, + void *obj) +{ + struct rxe_mmap_info *ip; + + ip = kmalloc(sizeof(*ip), GFP_KERNEL); + if (!ip) + return NULL; + + size = PAGE_ALIGN(size); + + spin_lock_bh(&rxe->mmap_offset_lock); + + if (rxe->mmap_offset == 0) + rxe->mmap_offset = PAGE_SIZE; + + ip->info.offset = rxe->mmap_offset; + rxe->mmap_offset += size; + + spin_unlock_bh(&rxe->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->info.size = size; + ip->context = context; + ip->obj = obj; + kref_init(&ip->ref); + + return ip; +} diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c new file mode 100644 index 000000000000..f3dab6574504 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -0,0 +1,643 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* + * lfsr (linear feedback shift register) with period 255 + */ +static u8 rxe_get_key(void) +{ + static unsigned key = 1; + + key = key << 1; + + key |= (0 != (key & 0x100)) ^ (0 != (key & 0x10)) + ^ (0 != (key & 0x80)) ^ (0 != (key & 0x40)); + + key &= 0xff; + + return key; +} + +int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length) +{ + switch (mem->type) { + case RXE_MEM_TYPE_DMA: + return 0; + + case RXE_MEM_TYPE_MR: + case RXE_MEM_TYPE_FMR: + return ((iova < mem->iova) || + ((iova + length) > (mem->iova + mem->length))) ? 
+ -EFAULT : 0; + + default: + return -EFAULT; + } +} + +#define IB_ACCESS_REMOTE (IB_ACCESS_REMOTE_READ \ + | IB_ACCESS_REMOTE_WRITE \ + | IB_ACCESS_REMOTE_ATOMIC) + +static void rxe_mem_init(int access, struct rxe_mem *mem) +{ + u32 lkey = mem->pelem.index << 8 | rxe_get_key(); + u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0; + + if (mem->pelem.pool->type == RXE_TYPE_MR) { + mem->ibmr.lkey = lkey; + mem->ibmr.rkey = rkey; + } + + mem->lkey = lkey; + mem->rkey = rkey; + mem->state = RXE_MEM_STATE_INVALID; + mem->type = RXE_MEM_TYPE_NONE; + mem->map_shift = ilog2(RXE_BUF_PER_MAP); +} + +void rxe_mem_cleanup(void *arg) +{ + struct rxe_mem *mem = arg; + int i; + + if (mem->umem) + ib_umem_release(mem->umem); + + if (mem->map) { + for (i = 0; i < mem->num_map; i++) + kfree(mem->map[i]); + + kfree(mem->map); + } +} + +static int rxe_mem_alloc(struct rxe_dev *rxe, struct rxe_mem *mem, int num_buf) +{ + int i; + int num_map; + struct rxe_map **map = mem->map; + + num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP; + + mem->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL); + if (!mem->map) + goto err1; + + for (i = 0; i < num_map; i++) { + mem->map[i] = kmalloc(sizeof(**map), GFP_KERNEL); + if (!mem->map[i]) + goto err2; + } + + WARN_ON(!is_power_of_2(RXE_BUF_PER_MAP)); + + mem->map_shift = ilog2(RXE_BUF_PER_MAP); + mem->map_mask = RXE_BUF_PER_MAP - 1; + + mem->num_buf = num_buf; + mem->num_map = num_map; + mem->max_buf = num_map * RXE_BUF_PER_MAP; + + return 0; + +err2: + for (i--; i >= 0; i--) + kfree(mem->map[i]); + + kfree(mem->map); +err1: + return -ENOMEM; +} + +int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd, + int access, struct rxe_mem *mem) +{ + rxe_mem_init(access, mem); + + mem->pd = pd; + mem->access = access; + mem->state = RXE_MEM_STATE_VALID; + mem->type = RXE_MEM_TYPE_DMA; + + return 0; +} + +int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start, + u64 length, u64 iova, int access, struct ib_udata *udata, + 
struct rxe_mem *mem) +{ + int entry; + struct rxe_map **map; + struct rxe_phys_buf *buf = NULL; + struct ib_umem *umem; + struct scatterlist *sg; + int num_buf; + void *vaddr; + int err; + + umem = ib_umem_get(pd->ibpd.uobject->context, start, length, access, 0); + if (IS_ERR(umem)) { + pr_warn("err %d from rxe_umem_get\n", + (int)PTR_ERR(umem)); + err = -EINVAL; + goto err1; + } + + mem->umem = umem; + num_buf = umem->nmap; + + rxe_mem_init(access, mem); + + err = rxe_mem_alloc(rxe, mem, num_buf); + if (err) { + pr_warn("err %d from rxe_mem_alloc\n", err); + ib_umem_release(umem); + goto err1; + } + + WARN_ON(!is_power_of_2(umem->page_size)); + + mem->page_shift = ilog2(umem->page_size); + mem->page_mask = umem->page_size - 1; + + num_buf = 0; + map = mem->map; + if (length > 0) { + buf = map[0]->buf; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + pr_warn("null vaddr\n"); + err = -ENOMEM; + goto err1; + } + + buf->addr = (uintptr_t)vaddr; + buf->size = umem->page_size; + num_buf++; + buf++; + + if (num_buf >= RXE_BUF_PER_MAP) { + map++; + buf = map[0]->buf; + num_buf = 0; + } + } + } + + mem->pd = pd; + mem->umem = umem; + mem->access = access; + mem->length = length; + mem->iova = iova; + mem->va = start; + mem->offset = ib_umem_offset(umem); + mem->state = RXE_MEM_STATE_VALID; + mem->type = RXE_MEM_TYPE_MR; + + return 0; + +err1: + return err; +} + +int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd, + int max_pages, struct rxe_mem *mem) +{ + int err; + + rxe_mem_init(0, mem); + + /* In fastreg, we also set the rkey */ + mem->ibmr.rkey = mem->ibmr.lkey; + + err = rxe_mem_alloc(rxe, mem, max_pages); + if (err) + goto err1; + + mem->pd = pd; + mem->max_buf = max_pages; + mem->state = RXE_MEM_STATE_FREE; + mem->type = RXE_MEM_TYPE_MR; + + return 0; + +err1: + return err; +} + +static void lookup_iova( + struct rxe_mem *mem, + u64 iova, + int *m_out, + int *n_out, + size_t *offset_out) +{ 
+ size_t offset = iova - mem->iova + mem->offset; + int map_index; + int buf_index; + u64 length; + + if (likely(mem->page_shift)) { + *offset_out = offset & mem->page_mask; + offset >>= mem->page_shift; + *n_out = offset & mem->map_mask; + *m_out = offset >> mem->map_shift; + } else { + map_index = 0; + buf_index = 0; + + length = mem->map[map_index]->buf[buf_index].size; + + while (offset >= length) { + offset -= length; + buf_index++; + + if (buf_index == RXE_BUF_PER_MAP) { + map_index++; + buf_index = 0; + } + length = mem->map[map_index]->buf[buf_index].size; + } + + *m_out = map_index; + *n_out = buf_index; + *offset_out = offset; + } +} + +void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length) +{ + size_t offset; + int m, n; + void *addr; + + if (mem->state != RXE_MEM_STATE_VALID) { + pr_warn("mem not in valid state\n"); + addr = NULL; + goto out; + } + + if (!mem->map) { + addr = (void *)(uintptr_t)iova; + goto out; + } + + if (mem_check_range(mem, iova, length)) { + pr_warn("range violation\n"); + addr = NULL; + goto out; + } + + lookup_iova(mem, iova, &m, &n, &offset); + + if (offset + length > mem->map[m]->buf[n].size) { + pr_warn("crosses page boundary\n"); + addr = NULL; + goto out; + } + + addr = (void *)(uintptr_t)mem->map[m]->buf[n].addr + offset; + +out: + return addr; +} + +/* copy data from a range (vaddr, vaddr+length-1) to or from + * a mem object starting at iova. Compute incremental value of + * crc32 if crcp is not zero. caller must hold a reference to mem + */ +int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length, + enum copy_direction dir, u32 *crcp) +{ + int err; + int bytes; + u8 *va; + struct rxe_map **map; + struct rxe_phys_buf *buf; + int m; + int i; + size_t offset; + u32 crc = crcp ? (*crcp) : 0; + + if (mem->type == RXE_MEM_TYPE_DMA) { + u8 *src, *dest; + + src = (dir == to_mem_obj) ? + addr : ((void *)(uintptr_t)iova); + + dest = (dir == to_mem_obj) ? 
+ ((void *)(uintptr_t)iova) : addr; + + if (crcp) + *crcp = crc32_le(*crcp, src, length); + + memcpy(dest, src, length); + + return 0; + } + + WARN_ON(!mem->map); + + err = mem_check_range(mem, iova, length); + if (err) { + err = -EFAULT; + goto err1; + } + + lookup_iova(mem, iova, &m, &i, &offset); + + map = mem->map + m; + buf = map[0]->buf + i; + + while (length > 0) { + u8 *src, *dest; + + va = (u8 *)(uintptr_t)buf->addr + offset; + src = (dir == to_mem_obj) ? addr : va; + dest = (dir == to_mem_obj) ? va : addr; + + bytes = buf->size - offset; + + if (bytes > length) + bytes = length; + + if (crcp) + crc = crc32_le(crc, src, bytes); + + memcpy(dest, src, bytes); + + length -= bytes; + addr += bytes; + + offset = 0; + buf++; + i++; + + if (i == RXE_BUF_PER_MAP) { + i = 0; + map++; + buf = map[0]->buf; + } + } + + if (crcp) + *crcp = crc; + + return 0; + +err1: + return err; +} + +/* copy data in or out of a wqe, i.e. sg list + * under the control of a dma descriptor + */ +int copy_data( + struct rxe_dev *rxe, + struct rxe_pd *pd, + int access, + struct rxe_dma_info *dma, + void *addr, + int length, + enum copy_direction dir, + u32 *crcp) +{ + int bytes; + struct rxe_sge *sge = &dma->sge[dma->cur_sge]; + int offset = dma->sge_offset; + int resid = dma->resid; + struct rxe_mem *mem = NULL; + u64 iova; + int err; + + if (length == 0) + return 0; + + if (length > resid) { + err = -EINVAL; + goto err2; + } + + if (sge->length && (offset < sge->length)) { + mem = lookup_mem(pd, access, sge->lkey, lookup_local); + if (!mem) { + err = -EINVAL; + goto err1; + } + } + + while (length > 0) { + bytes = length; + + if (offset >= sge->length) { + if (mem) { + rxe_drop_ref(mem); + mem = NULL; + } + sge++; + dma->cur_sge++; + offset = 0; + + if (dma->cur_sge >= dma->num_sge) { + err = -ENOSPC; + goto err2; + } + + if (sge->length) { + mem = lookup_mem(pd, access, sge->lkey, + lookup_local); + if (!mem) { + err = -EINVAL; + goto err1; + } + } else { + continue; + } + } + + if 
(bytes > sge->length - offset) + bytes = sge->length - offset; + + if (bytes > 0) { + iova = sge->addr + offset; + + err = rxe_mem_copy(mem, iova, addr, bytes, dir, crcp); + if (err) + goto err2; + + offset += bytes; + resid -= bytes; + length -= bytes; + addr += bytes; + } + } + + dma->sge_offset = offset; + dma->resid = resid; + + if (mem) + rxe_drop_ref(mem); + + return 0; + +err2: + if (mem) + rxe_drop_ref(mem); +err1: + return err; +} + +int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) +{ + struct rxe_sge *sge = &dma->sge[dma->cur_sge]; + int offset = dma->sge_offset; + int resid = dma->resid; + + while (length) { + unsigned int bytes; + + if (offset >= sge->length) { + sge++; + dma->cur_sge++; + offset = 0; + if (dma->cur_sge >= dma->num_sge) + return -ENOSPC; + } + + bytes = length; + + if (bytes > sge->length - offset) + bytes = sge->length - offset; + + offset += bytes; + resid -= bytes; + length -= bytes; + } + + dma->sge_offset = offset; + dma->resid = resid; + + return 0; +} + +/* (1) find the mem (mr or mw) corresponding to lkey/rkey + * depending on lookup_type + * (2) verify that the (qp) pd matches the mem pd + * (3) verify that the mem can support the requested access + * (4) verify that mem state is valid + */ +struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key, + enum lookup_type type) +{ + struct rxe_mem *mem; + struct rxe_dev *rxe = to_rdev(pd->ibpd.device); + int index = key >> 8; + + if (index >= RXE_MIN_MR_INDEX && index <= RXE_MAX_MR_INDEX) { + mem = rxe_pool_get_index(&rxe->mr_pool, index); + if (!mem) + goto err1; + } else { + goto err1; + } + + if ((type == lookup_local && mem->lkey != key) || + (type == lookup_remote && mem->rkey != key)) + goto err2; + + if (mem->pd != pd) + goto err2; + + if (access && !(access & mem->access)) + goto err2; + + if (mem->state != RXE_MEM_STATE_VALID) + goto err2; + + return mem; + +err2: + rxe_drop_ref(mem); +err1: + return NULL; +} + +int rxe_mem_map_pages(struct 
rxe_dev *rxe, struct rxe_mem *mem, + u64 *page, int num_pages, u64 iova) +{ + int i; + int num_buf; + int err; + struct rxe_map **map; + struct rxe_phys_buf *buf; + int page_size; + + if (num_pages > mem->max_buf) { + err = -EINVAL; + goto err1; + } + + num_buf = 0; + page_size = 1 << mem->page_shift; + map = mem->map; + buf = map[0]->buf; + + for (i = 0; i < num_pages; i++) { + buf->addr = *page++; + buf->size = page_size; + buf++; + num_buf++; + + if (num_buf == RXE_BUF_PER_MAP) { + map++; + buf = map[0]->buf; + num_buf = 0; + } + } + + mem->iova = iova; + mem->va = iova; + mem->length = num_pages << mem->page_shift; + mem->state = RXE_MEM_STATE_VALID; + + return 0; + +err1: + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c new file mode 100644 index 000000000000..0b8d2ea8b41d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -0,0 +1,708 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <linux/if_vlan.h> +#include <net/udp_tunnel.h> +#include <net/sch_generic.h> +#include <linux/netfilter.h> +#include <rdma/ib_addr.h> + +#include "rxe.h" +#include "rxe_net.h" +#include "rxe_loc.h" + +static LIST_HEAD(rxe_dev_list); +static spinlock_t dev_list_lock; /* spinlock for device list */ + +struct rxe_dev *net_to_rxe(struct net_device *ndev) +{ + struct rxe_dev *rxe; + struct rxe_dev *found = NULL; + + spin_lock_bh(&dev_list_lock); + list_for_each_entry(rxe, &rxe_dev_list, list) { + if (rxe->ndev == ndev) { + found = rxe; + break; + } + } + spin_unlock_bh(&dev_list_lock); + + return found; +} + +struct rxe_dev *get_rxe_by_name(const char* name) +{ + struct rxe_dev *rxe; + struct rxe_dev *found = NULL; + + spin_lock_bh(&dev_list_lock); + list_for_each_entry(rxe, &rxe_dev_list, list) { + if (!strcmp(name, rxe->ib_dev.name)) { + found = rxe; + break; + } + } + spin_unlock_bh(&dev_list_lock); + return found; +} + + +struct rxe_recv_sockets recv_sockets; + +static __be64 rxe_mac_to_eui64(struct net_device *ndev) +{ + unsigned char *mac_addr = ndev->dev_addr; + __be64 eui64; + unsigned char *dst = (unsigned char *)&eui64; + + dst[0] = mac_addr[0] ^ 2; + dst[1] = mac_addr[1]; + dst[2] = mac_addr[2]; + dst[3] = 0xff; + dst[4] = 0xfe; + dst[5] = mac_addr[3]; + dst[6] = mac_addr[4]; + dst[7] = mac_addr[5]; + + return eui64; +} + +static __be64 
node_guid(struct rxe_dev *rxe) +{ + return rxe_mac_to_eui64(rxe->ndev); +} + +static __be64 port_guid(struct rxe_dev *rxe) +{ + return rxe_mac_to_eui64(rxe->ndev); +} + +static struct device *dma_device(struct rxe_dev *rxe) +{ + struct net_device *ndev; + + ndev = rxe->ndev; + + if (ndev->priv_flags & IFF_802_1Q_VLAN) + ndev = vlan_dev_real_dev(ndev); + + return ndev->dev.parent; +} + +static int mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) +{ + int err; + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + err = dev_mc_add(rxe->ndev, ll_addr); + + return err; +} + +static int mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) +{ + int err; + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + err = dev_mc_del(rxe->ndev, ll_addr); + + return err; +} + +static struct dst_entry *rxe_find_route4(struct net_device *ndev, + struct in_addr *saddr, + struct in_addr *daddr) +{ + struct rtable *rt; + struct flowi4 fl = { { 0 } }; + + memset(&fl, 0, sizeof(fl)); + fl.flowi4_oif = ndev->ifindex; + memcpy(&fl.saddr, saddr, sizeof(*saddr)); + memcpy(&fl.daddr, daddr, sizeof(*daddr)); + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(&init_net, &fl); + if (IS_ERR(rt)) { + pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr); + return NULL; + } + + return &rt->dst; +} + +#if IS_ENABLED(CONFIG_IPV6) +static struct dst_entry *rxe_find_route6(struct net_device *ndev, + struct in6_addr *saddr, + struct in6_addr *daddr) +{ + struct dst_entry *ndst; + struct flowi6 fl6 = { { 0 } }; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = ndev->ifindex; + memcpy(&fl6.saddr, saddr, sizeof(*saddr)); + memcpy(&fl6.daddr, daddr, sizeof(*daddr)); + fl6.flowi6_proto = IPPROTO_UDP; + + if (unlikely(ipv6_stub->ipv6_dst_lookup(sock_net(recv_sockets.sk6->sk), + recv_sockets.sk6->sk, &ndst, &fl6))) { + pr_err_ratelimited("no route to %pI6\n", daddr); + goto put; + } + + if 
(unlikely(ndst->error)) { + pr_err("no route to %pI6\n", daddr); + goto put; + } + + return ndst; +put: + dst_release(ndst); + return NULL; +} + +#else + +static struct dst_entry *rxe_find_route6(struct net_device *ndev, + struct in6_addr *saddr, + struct in6_addr *daddr) +{ + return NULL; +} + +#endif + +static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct udphdr *udph; + struct net_device *ndev = skb->dev; + struct rxe_dev *rxe = net_to_rxe(ndev); + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + if (!rxe) + goto drop; + + if (skb_linearize(skb)) { + pr_err("skb_linearize failed\n"); + goto drop; + } + + udph = udp_hdr(skb); + pkt->rxe = rxe; + pkt->port_num = 1; + pkt->hdr = (u8 *)(udph + 1); + pkt->mask = RXE_GRH_MASK; + pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); + + return rxe_rcv(skb); +drop: + kfree_skb(skb); + return 0; +} + +static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, + bool ipv6) +{ + int err; + struct socket *sock; + struct udp_port_cfg udp_cfg; + struct udp_tunnel_sock_cfg tnl_cfg; + + memset(&udp_cfg, 0, sizeof(udp_cfg)); + + if (ipv6) { + udp_cfg.family = AF_INET6; + udp_cfg.ipv6_v6only = 1; + } else { + udp_cfg.family = AF_INET; + } + + udp_cfg.local_udp_port = port; + + /* Create UDP socket */ + err = udp_sock_create(net, &udp_cfg, &sock); + if (err < 0) { + pr_err("failed to create udp socket. 
err = %d\n", err); + return ERR_PTR(err); + } + + tnl_cfg.sk_user_data = NULL; + tnl_cfg.encap_type = 1; + tnl_cfg.encap_rcv = rxe_udp_encap_recv; + tnl_cfg.encap_destroy = NULL; + + /* Setup UDP tunnel */ + setup_udp_tunnel_sock(net, sock, &tnl_cfg); + + return sock; +} + +static void rxe_release_udp_tunnel(struct socket *sk) +{ + udp_tunnel_sock_release(sk); +} + +static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port, + __be16 dst_port) +{ + struct udphdr *udph; + + __skb_push(skb, sizeof(*udph)); + skb_reset_transport_header(skb); + udph = udp_hdr(skb); + + udph->dest = dst_port; + udph->source = src_port; + udph->len = htons(skb->len); + udph->check = 0; +} + +static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb, + __be32 saddr, __be32 daddr, __u8 proto, + __u8 tos, __u8 ttl, __be16 df, bool xnet) +{ + struct iphdr *iph; + + skb_scrub_packet(skb, xnet); + + skb_clear_hash(skb); + skb_dst_set(skb, dst); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + + iph->version = IPVERSION; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = proto; + iph->tos = tos; + iph->daddr = daddr; + iph->saddr = saddr; + iph->ttl = ttl; + __ip_select_ident(dev_net(dst->dev), iph, + skb_shinfo(skb)->gso_segs ?: 1); + iph->tot_len = htons(skb->len); + ip_send_check(iph); +} + +static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, + struct in6_addr *saddr, struct in6_addr *daddr, + __u8 proto, __u8 prio, __u8 ttl) +{ + struct ipv6hdr *ip6h; + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED + | IPSKB_REROUTED); + skb_dst_set(skb, dst); + + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = proto; + 
ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); +} + +static int prepare4(struct rxe_dev *rxe, struct sk_buff *skb, struct rxe_av *av) +{ + struct dst_entry *dst; + bool xnet = false; + __be16 df = htons(IP_DF); + struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; + struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + dst = rxe_find_route4(rxe->ndev, saddr, daddr); + if (!dst) { + pr_err("Host not reachable\n"); + return -EHOSTUNREACH; + } + + if (!memcmp(saddr, daddr, sizeof(*daddr))) + pkt->mask |= RXE_LOOPBACK_MASK; + + prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), + htons(ROCE_V2_UDP_DPORT)); + + prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, + av->grh.traffic_class, av->grh.hop_limit, df, xnet); + return 0; +} + +static int prepare6(struct rxe_dev *rxe, struct sk_buff *skb, struct rxe_av *av) +{ + struct dst_entry *dst; + struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; + struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + dst = rxe_find_route6(rxe->ndev, saddr, daddr); + if (!dst) { + pr_err("Host not reachable\n"); + return -EHOSTUNREACH; + } + + if (!memcmp(saddr, daddr, sizeof(*daddr))) + pkt->mask |= RXE_LOOPBACK_MASK; + + prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), + htons(ROCE_V2_UDP_DPORT)); + + prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, + av->grh.traffic_class, + av->grh.hop_limit); + return 0; +} + +static int prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb, u32 *crc) +{ + int err = 0; + struct rxe_av *av = rxe_get_av(pkt); + + if (av->network_type == RDMA_NETWORK_IPV4) + err = prepare4(rxe, skb, av); + else if (av->network_type == RDMA_NETWORK_IPV6) + err = prepare6(rxe, skb, av); + + *crc = rxe_icrc_hdr(pkt, skb); + + return err; +} + +static void 
rxe_skb_tx_dtor(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct rxe_qp *qp = sk->sk_user_data; + int skb_out = atomic_dec_return(&qp->skb_out); + + if (unlikely(qp->need_req_skb && + skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) + rxe_run_task(&qp->req.task, 1); +} + +static int send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb) +{ + struct sk_buff *nskb; + struct rxe_av *av; + int err; + + av = rxe_get_av(pkt); + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + nskb->destructor = rxe_skb_tx_dtor; + nskb->sk = pkt->qp->sk->sk; + + if (av->network_type == RDMA_NETWORK_IPV4) { + err = ip_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb); + } else if (av->network_type == RDMA_NETWORK_IPV6) { + err = ip6_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb); + } else { + pr_err("Unknown layer 3 protocol: %d\n", av->network_type); + kfree_skb(nskb); + return -EINVAL; + } + + if (unlikely(net_xmit_eval(err))) { + pr_debug("error sending packet: %d\n", err); + return -EAGAIN; + } + + kfree_skb(skb); + + return 0; +} + +static int loopback(struct sk_buff *skb) +{ + return rxe_rcv(skb); +} + +static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av) +{ + return rxe->port.port_guid == av->grh.dgid.global.interface_id; +} + +static struct sk_buff *init_packet(struct rxe_dev *rxe, struct rxe_av *av, + int paylen, struct rxe_pkt_info *pkt) +{ + unsigned int hdr_len; + struct sk_buff *skb; + + if (av->network_type == RDMA_NETWORK_IPV4) + hdr_len = ETH_HLEN + sizeof(struct udphdr) + + sizeof(struct iphdr); + else + hdr_len = ETH_HLEN + sizeof(struct udphdr) + + sizeof(struct ipv6hdr); + + skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(rxe->ndev), + GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(rxe->ndev)); + + skb->dev = rxe->ndev; + if (av->network_type == RDMA_NETWORK_IPV4) + skb->protocol = htons(ETH_P_IP); + else + skb->protocol = htons(ETH_P_IPV6); 
+ + pkt->rxe = rxe; + pkt->port_num = 1; + pkt->hdr = skb_put(skb, paylen); + pkt->mask |= RXE_GRH_MASK; + + memset(pkt->hdr, 0, paylen); + + return skb; +} + +/* + * this is required by rxe_cfg to match rxe devices in + * /sys/class/infiniband up with their underlying ethernet devices + */ +static char *parent_name(struct rxe_dev *rxe, unsigned int port_num) +{ + return rxe->ndev->name; +} + +static enum rdma_link_layer link_layer(struct rxe_dev *rxe, + unsigned int port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static struct rxe_ifc_ops ifc_ops = { + .node_guid = node_guid, + .port_guid = port_guid, + .dma_device = dma_device, + .mcast_add = mcast_add, + .mcast_delete = mcast_delete, + .prepare = prepare, + .send = send, + .loopback = loopback, + .init_packet = init_packet, + .parent_name = parent_name, + .link_layer = link_layer, +}; + +struct rxe_dev *rxe_net_add(struct net_device *ndev) +{ + int err; + struct rxe_dev *rxe = NULL; + + rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe)); + if (!rxe) + return NULL; + + rxe->ifc_ops = &ifc_ops; + rxe->ndev = ndev; + + err = rxe_add(rxe, ndev->mtu); + if (err) { + ib_dealloc_device(&rxe->ib_dev); + return NULL; + } + + spin_lock_bh(&dev_list_lock); + list_add_tail(&rxe_dev_list, &rxe->list); + spin_unlock_bh(&dev_list_lock); + return rxe; +} + +void rxe_remove_all(void) +{ + spin_lock_bh(&dev_list_lock); + while (!list_empty(&rxe_dev_list)) { + struct rxe_dev *rxe = + list_first_entry(&rxe_dev_list, struct rxe_dev, list); + + list_del(&rxe->list); + spin_unlock_bh(&dev_list_lock); + rxe_remove(rxe); + spin_lock_bh(&dev_list_lock); + } + spin_unlock_bh(&dev_list_lock); +} +EXPORT_SYMBOL(rxe_remove_all); + +static void rxe_port_event(struct rxe_dev *rxe, + enum ib_event_type event) +{ + struct ib_event ev; + + ev.device = &rxe->ib_dev; + ev.element.port_num = 1; + ev.event = event; + + ib_dispatch_event(&ev); +} + +/* Caller must hold net_info_lock */ +void rxe_port_up(struct rxe_dev *rxe) +{ + struct rxe_port 
*port; + + port = &rxe->port; + port->attr.state = IB_PORT_ACTIVE; + port->attr.phys_state = IB_PHYS_STATE_LINK_UP; + + rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); + pr_info("rxe: set %s active\n", rxe->ib_dev.name); + return; +} + +/* Caller must hold net_info_lock */ +void rxe_port_down(struct rxe_dev *rxe) +{ + struct rxe_port *port; + + port = &rxe->port; + port->attr.state = IB_PORT_DOWN; + port->attr.phys_state = IB_PHYS_STATE_LINK_DOWN; + + rxe_port_event(rxe, IB_EVENT_PORT_ERR); + pr_info("rxe: set %s down\n", rxe->ib_dev.name); + return; +} + +static int rxe_notify(struct notifier_block *not_blk, + unsigned long event, + void *arg) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(arg); + struct rxe_dev *rxe = net_to_rxe(ndev); + + if (!rxe) + goto out; + + switch (event) { + case NETDEV_UNREGISTER: + list_del(&rxe->list); + rxe_remove(rxe); + break; + case NETDEV_UP: + rxe_port_up(rxe); + break; + case NETDEV_DOWN: + rxe_port_down(rxe); + break; + case NETDEV_CHANGEMTU: + pr_info("rxe: %s changed mtu to %d\n", ndev->name, ndev->mtu); + rxe_set_mtu(rxe, ndev->mtu); + break; + case NETDEV_REBOOT: + case NETDEV_CHANGE: + case NETDEV_GOING_DOWN: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGENAME: + case NETDEV_FEAT_CHANGE: + default: + pr_info("rxe: ignoring netdev event = %ld for %s\n", + event, ndev->name); + break; + } +out: + return NOTIFY_OK; +} + +static struct notifier_block rxe_net_notifier = { + .notifier_call = rxe_notify, +}; + +int rxe_net_init(void) +{ + int err; + + spin_lock_init(&dev_list_lock); + + recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, + htons(ROCE_V2_UDP_DPORT), true); + if (IS_ERR(recv_sockets.sk6)) { + recv_sockets.sk6 = NULL; + pr_err("rxe: Failed to create IPv6 UDP tunnel\n"); + return -1; + } + + recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net, + htons(ROCE_V2_UDP_DPORT), false); + if (IS_ERR(recv_sockets.sk4)) { + rxe_release_udp_tunnel(recv_sockets.sk6); + recv_sockets.sk4 = NULL; + recv_sockets.sk6 = NULL; + 
pr_err("rxe: Failed to create IPv4 UDP tunnel\n"); + return -1; + } + + err = register_netdevice_notifier(&rxe_net_notifier); + if (err) { + rxe_release_udp_tunnel(recv_sockets.sk6); + rxe_release_udp_tunnel(recv_sockets.sk4); + pr_err("rxe: Failed to rigister netdev notifier\n"); + } + + return err; +} + +void rxe_net_exit(void) +{ + if (recv_sockets.sk6) + rxe_release_udp_tunnel(recv_sockets.sk6); + + if (recv_sockets.sk4) + rxe_release_udp_tunnel(recv_sockets.sk4); + + unregister_netdevice_notifier(&rxe_net_notifier); +} diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h new file mode 100644 index 000000000000..7b06f76d16cc --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
#ifndef RXE_NET_H
#define RXE_NET_H

#include <net/sock.h>
#include <net/if_inet6.h>
#include <linux/module.h>

/* The pair of UDP tunnel sockets rxe receives on.  rxe_net_init() in
 * rxe_net.c creates both on ROCE_V2_UDP_DPORT; a pointer is NULL when the
 * corresponding socket does not exist.
 */
struct rxe_recv_sockets {
	struct socket *sk4;	/* AF_INET socket */
	struct socket *sk6;	/* AF_INET6 socket */
};

/* single instance, defined in rxe_net.c */
extern struct rxe_recv_sockets recv_sockets;

/* create an rxe device on top of ndev; returns NULL on failure */
struct rxe_dev *rxe_net_add(struct net_device *ndev);

/* set up / tear down the tunnel sockets and the netdevice notifier */
int rxe_net_init(void);
void rxe_net_exit(void);

#endif /* RXE_NET_H */
/* useful information about work request opcodes and pkt opcodes in
 * table form
 *
 * rxe_wr_opcode_info[] is indexed by work request opcode (IB_WR_*).  Each
 * entry carries the opcode's name and a .mask[] array indexed by QP type
 * (IB_QPT_*) holding the WR_*_MASK flags for that combination.  A QP type
 * that is not listed for an opcode is left at 0 by the designated
 * initializers.
 */
struct rxe_wr_opcode_info rxe_wr_opcode_info[] = {
	[IB_WR_RDMA_WRITE]				= {
		.name	= "IB_WR_RDMA_WRITE",
		.mask	= {
			[IB_QPT_RC]	= WR_INLINE_MASK | WR_WRITE_MASK,
			[IB_QPT_UC]	= WR_INLINE_MASK | WR_WRITE_MASK,
		},
	},
	[IB_WR_RDMA_WRITE_WITH_IMM]			= {
		.name	= "IB_WR_RDMA_WRITE_WITH_IMM",
		.mask	= {
			[IB_QPT_RC]	= WR_INLINE_MASK | WR_WRITE_MASK,
			[IB_QPT_UC]	= WR_INLINE_MASK | WR_WRITE_MASK,
		},
	},
	[IB_WR_SEND]					= {
		.name	= "IB_WR_SEND",
		.mask	= {
			[IB_QPT_SMI]	= WR_INLINE_MASK | WR_SEND_MASK,
			[IB_QPT_GSI]	= WR_INLINE_MASK | WR_SEND_MASK,
			[IB_QPT_RC]	= WR_INLINE_MASK | WR_SEND_MASK,
			[IB_QPT_UC]	= WR_INLINE_MASK | WR_SEND_MASK,
			[IB_QPT_UD]	= WR_INLINE_MASK | WR_SEND_MASK,
		},
	},
	[IB_WR_SEND_WITH_IMM]				= {
		.name	= "IB_WR_SEND_WITH_IMM",
		.mask	= {
			[IB_QPT_SMI]	= WR_INLINE_MASK | WR_SEND_MASK,
			[IB_QPT_GSI]	= WR_INLINE_MASK | WR_SEND_MASK,
			[IB_QPT_RC]	= WR_INLINE_MASK | WR_SEND_MASK,
			[IB_QPT_UC]	= WR_INLINE_MASK | WR_SEND_MASK,
			[IB_QPT_UD]	= WR_INLINE_MASK | WR_SEND_MASK,
		},
	},
	[IB_WR_RDMA_READ]				= {
		.name	= "IB_WR_RDMA_READ",
		.mask	= {
			[IB_QPT_RC]	= WR_READ_MASK,
		},
	},
	[IB_WR_ATOMIC_CMP_AND_SWP]			= {
		.name	= "IB_WR_ATOMIC_CMP_AND_SWP",
		.mask	= {
			[IB_QPT_RC]	= WR_ATOMIC_MASK,
		},
	},
	[IB_WR_ATOMIC_FETCH_AND_ADD]			= {
		.name	= "IB_WR_ATOMIC_FETCH_AND_ADD",
		.mask	= {
			[IB_QPT_RC]	= WR_ATOMIC_MASK,
		},
	},
	[IB_WR_LSO]					= {
		.name	= "IB_WR_LSO",
		.mask	= {
			/* not supported */
		},
	},
	[IB_WR_SEND_WITH_INV]				= {
		.name	= "IB_WR_SEND_WITH_INV",
		.mask	= {
			[IB_QPT_RC]	= WR_INLINE_MASK | WR_SEND_MASK,
			[IB_QPT_UC]	= WR_INLINE_MASK | WR_SEND_MASK,
			[IB_QPT_UD]	= WR_INLINE_MASK | WR_SEND_MASK,
		},
	},
	[IB_WR_RDMA_READ_WITH_INV]			= {
		.name	= "IB_WR_RDMA_READ_WITH_INV",
		.mask	= {
			[IB_QPT_RC]	= WR_READ_MASK,
		},
	},
	[IB_WR_LOCAL_INV]				= {
		.name	= "IB_WR_LOCAL_INV",
		.mask	= {
			[IB_QPT_RC]	= WR_REG_MASK,
		},
	},
	[IB_WR_REG_MR]					= {
		.name	= "IB_WR_REG_MR",
		.mask	= {
			[IB_QPT_RC]	= WR_REG_MASK,
		},
	},
};
"IB_WR_ATOMIC_FETCH_AND_ADD", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_MASK, + }, + }, + [IB_WR_LSO] = { + .name = "IB_WR_LSO", + .mask = { + /* not supported */ + }, + }, + [IB_WR_SEND_WITH_INV] = { + .name = "IB_WR_SEND_WITH_INV", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_RDMA_READ_WITH_INV] = { + .name = "IB_WR_RDMA_READ_WITH_INV", + .mask = { + [IB_QPT_RC] = WR_READ_MASK, + }, + }, + [IB_WR_LOCAL_INV] = { + .name = "IB_WR_LOCAL_INV", + .mask = { + [IB_QPT_RC] = WR_REG_MASK, + }, + }, + [IB_WR_REG_MR] = { + .name = "IB_WR_REG_MR", + .mask = { + [IB_QPT_RC] = WR_REG_MASK, + }, + }, +}; + +struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = { + [IB_OPCODE_RC_SEND_FIRST] = { + .name = "IB_OPCODE_RC_SEND_FIRST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_MIDDLE] = { + .name = "IB_OPCODE_RC_SEND_MIDDLE]", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST] = { + .name = "IB_OPCODE_RC_SEND_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY] = { + .name = "IB_OPCODE_RC_SEND_ONLY", + 
.mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_FIRST", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY", + .mask = 
RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = { + .name = "IB_OPCODE_RC_RDMA_READ_REQUEST", + .mask = RXE_RETH_MASK | RXE_REQ_MASK | RXE_READ_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_ACK_MASK | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = 
RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RC_ACKNOWLEDGE", + .mask = RXE_AETH_MASK | RXE_ACK_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE", + .mask = RXE_AETH_MASK | RXE_ATMACK_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_ATMACK] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMACK_BYTES + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_COMPARE_SWAP] = { + .name = "IB_OPCODE_RC_COMPARE_SWAP", + .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_ATMETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES, + } + }, + [IB_OPCODE_RC_FETCH_ADD] = { + .name = "IB_OPCODE_RC_FETCH_ADD", + .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_ATMETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = { + .name = "IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE", + .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | 
RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IETH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = { + .name = "IB_OPCODE_RC_SEND_ONLY_INV", + .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IETH_BYTES, + } + }, + + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = { + .name = "IB_OPCODE_UC_SEND_FIRST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_MIDDLE] = { + .name = "IB_OPCODE_UC_SEND_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_LAST] = { + .name = "IB_OPCODE_UC_SEND_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_SEND_ONLY] = { + .name = "IB_OPCODE_UC_SEND_ONLY", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { 
+ [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_FIRST", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, 
+ [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + + /* RD */ + [IB_OPCODE_RD_SEND_FIRST] = { + .name = "IB_OPCODE_RD_SEND_FIRST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_MIDDLE] = { + .name = "IB_OPCODE_RD_SEND_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_LAST] = { + .name = "IB_OPCODE_RD_SEND_LAST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_COMP_MASK | RXE_SEND_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + 
[IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_SEND_ONLY] = { + .name = "IB_OPCODE_RD_SEND_ONLY", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_FIRST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + 
+ RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_LAST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_PAYLOAD_MASK | 
RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES + + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_REQUEST] = { + .name = "IB_OPCODE_RD_RDMA_READ_REQUEST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_REQ_MASK | RXE_READ_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK + | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK, + .length = 
RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK + | RXE_ACK_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK + | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RD_ACKNOWLEDGE", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ATMACK_MASK + | RXE_ACK_MASK 
| RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMACK] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_COMPARE_SWAP] = { + .name = "RD_COMPARE_SWAP", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK + | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + + RXE_ATMETH_BYTES + + RXE_DETH_BYTES + + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_FETCH_ADD] = { + .name = "IB_OPCODE_RD_FETCH_ADD", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK + | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + + RXE_ATMETH_BYTES + + RXE_DETH_BYTES + + + RXE_RDETH_BYTES, + } + }, + + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = { + .name = "IB_OPCODE_UD_SEND_ONLY", + .mask = RXE_DETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_DETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_DETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | 
RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_DETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + +}; diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h new file mode 100644 index 000000000000..307604e9c78d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_OPCODE_H +#define RXE_OPCODE_H + +/* + * contains header bit mask definitions and header lengths + * declaration of the rxe_opcode_info struct and + * rxe_wr_opcode_info struct + */ + +enum rxe_wr_mask { + WR_INLINE_MASK = BIT(0), + WR_ATOMIC_MASK = BIT(1), + WR_SEND_MASK = BIT(2), + WR_READ_MASK = BIT(3), + WR_WRITE_MASK = BIT(4), + WR_LOCAL_MASK = BIT(5), + WR_REG_MASK = BIT(6), + + WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK, + WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK, + WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK, + WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK, +}; + +#define WR_MAX_QPT (8) + +struct rxe_wr_opcode_info { + char *name; + enum rxe_wr_mask mask[WR_MAX_QPT]; +}; + +extern struct rxe_wr_opcode_info rxe_wr_opcode_info[]; + +enum rxe_hdr_type { + RXE_LRH, + RXE_GRH, + RXE_BTH, + RXE_RETH, + RXE_AETH, + RXE_ATMETH, + RXE_ATMACK, + RXE_IETH, + RXE_RDETH, + RXE_DETH, + RXE_IMMDT, + RXE_PAYLOAD, + NUM_HDR_TYPES +}; + +enum rxe_hdr_mask { + RXE_LRH_MASK = BIT(RXE_LRH), + RXE_GRH_MASK = BIT(RXE_GRH), + RXE_BTH_MASK = BIT(RXE_BTH), + RXE_IMMDT_MASK = BIT(RXE_IMMDT), + RXE_RETH_MASK = BIT(RXE_RETH), + RXE_AETH_MASK = BIT(RXE_AETH), + RXE_ATMETH_MASK = BIT(RXE_ATMETH), + RXE_ATMACK_MASK = BIT(RXE_ATMACK), + RXE_IETH_MASK = BIT(RXE_IETH), + RXE_RDETH_MASK = BIT(RXE_RDETH), + RXE_DETH_MASK = BIT(RXE_DETH), + RXE_PAYLOAD_MASK = BIT(RXE_PAYLOAD), + + RXE_REQ_MASK = BIT(NUM_HDR_TYPES + 0), + RXE_ACK_MASK = BIT(NUM_HDR_TYPES + 1), + RXE_SEND_MASK = BIT(NUM_HDR_TYPES + 2), + RXE_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), + RXE_READ_MASK = BIT(NUM_HDR_TYPES + 4), + RXE_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), + + RXE_RWR_MASK = 
BIT(NUM_HDR_TYPES + 6), + RXE_COMP_MASK = BIT(NUM_HDR_TYPES + 7), + + RXE_START_MASK = BIT(NUM_HDR_TYPES + 8), + RXE_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), + RXE_END_MASK = BIT(NUM_HDR_TYPES + 10), + + RXE_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), + + RXE_READ_OR_ATOMIC = (RXE_READ_MASK | RXE_ATOMIC_MASK), + RXE_WRITE_OR_SEND = (RXE_WRITE_MASK | RXE_SEND_MASK), +}; + +#define OPCODE_NONE (-1) +#define RXE_NUM_OPCODE 256 + +struct rxe_opcode_info { + char *name; + enum rxe_hdr_mask mask; + int length; + int offset[NUM_HDR_TYPES]; +}; + +extern struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE]; + +#endif /* RXE_OPCODE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h new file mode 100644 index 000000000000..f459c43a77c8 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_PARAM_H +#define RXE_PARAM_H + +static inline enum ib_mtu rxe_mtu_int_to_enum(int mtu) +{ + if (mtu < 256) + return 0; + else if (mtu < 512) + return IB_MTU_256; + else if (mtu < 1024) + return IB_MTU_512; + else if (mtu < 2048) + return IB_MTU_1024; + else if (mtu < 4096) + return IB_MTU_2048; + else + return IB_MTU_4096; +} + +/* Find the IB mtu for a given network MTU. */ +static inline enum ib_mtu eth_mtu_int_to_enum(int mtu) +{ + mtu -= RXE_MAX_HDR_LENGTH; + + return rxe_mtu_int_to_enum(mtu); +} + +/* default/initial rxe device parameter settings */ +enum rxe_device_param { + RXE_FW_VER = 0, + RXE_MAX_MR_SIZE = -1ull, + RXE_PAGE_SIZE_CAP = 0xfffff000, + RXE_VENDOR_ID = 0, + RXE_VENDOR_PART_ID = 0, + RXE_HW_VER = 0, + RXE_MAX_QP = 0x10000, + RXE_MAX_QP_WR = 0x4000, + RXE_MAX_INLINE_DATA = 400, + RXE_DEVICE_CAP_FLAGS = IB_DEVICE_BAD_PKEY_CNTR + | IB_DEVICE_BAD_QKEY_CNTR + | IB_DEVICE_AUTO_PATH_MIG + | IB_DEVICE_CHANGE_PHY_PORT + | IB_DEVICE_UD_AV_PORT_ENFORCE + | IB_DEVICE_PORT_ACTIVE_EVENT + | IB_DEVICE_SYS_IMAGE_GUID + | IB_DEVICE_RC_RNR_NAK_GEN + | IB_DEVICE_SRQ_RESIZE + | IB_DEVICE_MEM_MGT_EXTENSIONS, + RXE_MAX_SGE = 32, + RXE_MAX_SGE_RD = 32, + RXE_MAX_CQ = 16384, + RXE_MAX_LOG_CQE = 13, + RXE_MAX_MR = 2 * 1024, + RXE_MAX_PD = 0x7ffc, + RXE_MAX_QP_RD_ATOM = 128, + RXE_MAX_EE_RD_ATOM = 0, + RXE_MAX_RES_RD_ATOM = 0x3f000, + RXE_MAX_QP_INIT_RD_ATOM = 128, + RXE_MAX_EE_INIT_RD_ATOM = 0, + RXE_ATOMIC_CAP = 1, + RXE_MAX_EE = 0, + RXE_MAX_RDD = 0, + 
RXE_MAX_MW = 0, + RXE_MAX_RAW_IPV6_QP = 0, + RXE_MAX_RAW_ETHY_QP = 0, + RXE_MAX_MCAST_GRP = 8192, + RXE_MAX_MCAST_QP_ATTACH = 56, + RXE_MAX_TOT_MCAST_QP_ATTACH = 0x70000, + RXE_MAX_AH = 100, + RXE_MAX_FMR = 0, + RXE_MAX_MAP_PER_FMR = 0, + RXE_MAX_SRQ = 960, + RXE_MAX_SRQ_WR = 0x4000, + RXE_MIN_SRQ_WR = 1, + RXE_MAX_SRQ_SGE = 27, + RXE_MIN_SRQ_SGE = 1, + RXE_MAX_FMR_PAGE_LIST_LEN = 512, + RXE_MAX_PKEYS = 64, + RXE_LOCAL_CA_ACK_DELAY = 15, + + RXE_MAX_UCONTEXT = 512, + + RXE_NUM_PORT = 1, + RXE_NUM_COMP_VECTORS = 1, + + RXE_MIN_QP_INDEX = 16, + RXE_MAX_QP_INDEX = 0x00020000, + + RXE_MIN_SRQ_INDEX = 0x00020001, + RXE_MAX_SRQ_INDEX = 0x00040000, + + RXE_MIN_MR_INDEX = 0x00000001, + RXE_MAX_MR_INDEX = 0x00040000, + RXE_MIN_MW_INDEX = 0x00040001, + RXE_MAX_MW_INDEX = 0x00060000, + RXE_MAX_PKT_PER_ACK = 64, + + RXE_MAX_UNACKED_PSNS = 128, + + /* Max inflight SKBs per queue pair */ + RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64, + RXE_INFLIGHT_SKBS_PER_QP_LOW = 16, + + /* Delay before calling arbiter timer */ + RXE_NSEC_ARB_TIMER_DELAY = 200, +}; + +/* default/initial rxe port parameters */ +enum rxe_port_param { + RXE_PORT_STATE = IB_PORT_DOWN, + RXE_PORT_MAX_MTU = IB_MTU_4096, + RXE_PORT_ACTIVE_MTU = IB_MTU_256, + RXE_PORT_GID_TBL_LEN = 1024, + RXE_PORT_PORT_CAP_FLAGS = RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP, + RXE_PORT_MAX_MSG_SZ = 0x800000, + RXE_PORT_BAD_PKEY_CNTR = 0, + RXE_PORT_QKEY_VIOL_CNTR = 0, + RXE_PORT_LID = 0, + RXE_PORT_SM_LID = 0, + RXE_PORT_SM_SL = 0, + RXE_PORT_LMC = 0, + RXE_PORT_MAX_VL_NUM = 1, + RXE_PORT_SUBNET_TIMEOUT = 0, + RXE_PORT_INIT_TYPE_REPLY = 0, + RXE_PORT_ACTIVE_WIDTH = IB_WIDTH_1X, + RXE_PORT_ACTIVE_SPEED = 1, + RXE_PORT_PKEY_TBL_LEN = 64, + RXE_PORT_PHYS_STATE = 2, + RXE_PORT_SUBNET_PREFIX = 0xfe80000000000000ULL, +}; + +/* default/initial port info parameters */ +enum rxe_port_info_param { + RXE_PORT_INFO_VL_CAP = 4, /* 1-8 */ + RXE_PORT_INFO_MTU_CAP = 5, /* 4096 */ + RXE_PORT_INFO_OPER_VL = 1, /* 1 */ +}; + +#endif /* RXE_PARAM_H */ diff --git 
a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c new file mode 100644 index 000000000000..6bac0717c540 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* info about object pools + * note that mr and mw share a single index space + * so that one can map an lkey to the correct type of object + */ +struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { + [RXE_TYPE_UC] = { + .name = "rxe-uc", + .size = sizeof(struct rxe_ucontext), + }, + [RXE_TYPE_PD] = { + .name = "rxe-pd", + .size = sizeof(struct rxe_pd), + }, + [RXE_TYPE_AH] = { + .name = "rxe-ah", + .size = sizeof(struct rxe_ah), + .flags = RXE_POOL_ATOMIC, + }, + [RXE_TYPE_SRQ] = { + .name = "rxe-srq", + .size = sizeof(struct rxe_srq), + .flags = RXE_POOL_INDEX, + .min_index = RXE_MIN_SRQ_INDEX, + .max_index = RXE_MAX_SRQ_INDEX, + }, + [RXE_TYPE_QP] = { + .name = "rxe-qp", + .size = sizeof(struct rxe_qp), + .cleanup = rxe_qp_cleanup, + .flags = RXE_POOL_INDEX, + .min_index = RXE_MIN_QP_INDEX, + .max_index = RXE_MAX_QP_INDEX, + }, + [RXE_TYPE_CQ] = { + .name = "rxe-cq", + .size = sizeof(struct rxe_cq), + .cleanup = rxe_cq_cleanup, + }, + [RXE_TYPE_MR] = { + .name = "rxe-mr", + .size = sizeof(struct rxe_mem), + .cleanup = rxe_mem_cleanup, + .flags = RXE_POOL_INDEX, + .max_index = RXE_MAX_MR_INDEX, + .min_index = RXE_MIN_MR_INDEX, + }, + [RXE_TYPE_MW] = { + .name = "rxe-mw", + .size = sizeof(struct rxe_mem), + .flags = RXE_POOL_INDEX, + .max_index = RXE_MAX_MW_INDEX, + .min_index = RXE_MIN_MW_INDEX, + }, + [RXE_TYPE_MC_GRP] = { + .name = "rxe-mc_grp", + .size = sizeof(struct rxe_mc_grp), + .cleanup = rxe_mc_cleanup, + .flags = RXE_POOL_KEY, + .key_offset = offsetof(struct rxe_mc_grp, mgid), + .key_size = sizeof(union ib_gid), + }, + [RXE_TYPE_MC_ELEM] = { + .name = "rxe-mc_elem", + .size = sizeof(struct rxe_mc_elem), + .flags = RXE_POOL_ATOMIC, + }, +}; + +static inline char *pool_name(struct rxe_pool *pool) +{ + return rxe_type_info[pool->type].name; +} + +static inline struct kmem_cache *pool_cache(struct rxe_pool *pool) +{ + return rxe_type_info[pool->type].cache; +} + +static inline enum rxe_elem_type 
rxe_type(void *arg) +{ + struct rxe_pool_entry *elem = arg; + + return elem->pool->type; +} + +/* + * Create one kmem cache per entry of rxe_type_info[]. + * Returns 0 on success. If any cache cannot be created, the caches + * created so far are destroyed again and -ENOMEM is returned. + */ +int rxe_cache_init(void) +{ + int err; + int i; + size_t size; + struct rxe_type_info *type; + + for (i = 0; i < RXE_NUM_TYPES; i++) { + type = &rxe_type_info[i]; + size = ALIGN(type->size, RXE_POOL_ALIGN); + type->cache = kmem_cache_create(type->name, size, + RXE_POOL_ALIGN, + RXE_POOL_CACHE_FLAGS, NULL); + if (!type->cache) { + pr_err("Unable to init kmem cache for %s\n", + type->name); + err = -ENOMEM; + goto err1; + } + } + + return 0; + +err1: + /* unwind: entry i failed, so release entries i-1 .. 0 */ + while (--i >= 0) { + /* re-point at the entry being released; without this the loop keeps hitting the failed (NULL) cache and leaks the rest */ + type = &rxe_type_info[i]; + kmem_cache_destroy(type->cache); + type->cache = NULL; + } + + return err; +} + +void rxe_cache_exit(void) +{ + int i; + struct rxe_type_info *type; + + for (i = 0; i < RXE_NUM_TYPES; i++) { + type = &rxe_type_info[i]; + kmem_cache_destroy(type->cache); + type->cache = NULL; + } +} + +static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min) +{ + int err = 0; + size_t size; + + if ((max - min + 1) < pool->max_elem) { + pr_warn("not enough indices for max_elem\n"); + err = -EINVAL; + goto out; + } + + pool->max_index = max; + pool->min_index = min; + + size = BITS_TO_LONGS(max - min + 1) * sizeof(long); + pool->table = kmalloc(size, GFP_KERNEL); + if (!pool->table) { + pr_warn("no memory for bit table\n"); + err = -ENOMEM; + goto out; + } + + pool->table_size = size; + bitmap_zero(pool->table, max - min + 1); + +out: + return err; +} + +int rxe_pool_init( + struct rxe_dev *rxe, + struct rxe_pool *pool, + enum rxe_elem_type type, + unsigned max_elem) +{ + int err = 0; + size_t size = rxe_type_info[type].size; + + memset(pool, 0, sizeof(*pool)); + + pool->rxe = rxe; + pool->type = type; + pool->max_elem = max_elem; + pool->elem_size = ALIGN(size, RXE_POOL_ALIGN); + pool->flags = rxe_type_info[type].flags; + pool->tree = RB_ROOT; + pool->cleanup = rxe_type_info[type].cleanup; + + atomic_set(&pool->num_elem, 0); + + kref_init(&pool->ref_cnt); + + spin_lock_init(&pool->pool_lock); + + if
(rxe_type_info[type].flags & RXE_POOL_INDEX) { + err = rxe_pool_init_index(pool, + rxe_type_info[type].max_index, + rxe_type_info[type].min_index); + if (err) + goto out; + } + + if (rxe_type_info[type].flags & RXE_POOL_KEY) { + pool->key_offset = rxe_type_info[type].key_offset; + pool->key_size = rxe_type_info[type].key_size; + } + + pool->state = rxe_pool_valid; + +out: + return err; +} + +static void rxe_pool_release(struct kref *kref) +{ + struct rxe_pool *pool = container_of(kref, struct rxe_pool, ref_cnt); + + pool->state = rxe_pool_invalid; + kfree(pool->table); +} + +static void rxe_pool_put(struct rxe_pool *pool) +{ + kref_put(&pool->ref_cnt, rxe_pool_release); +} + +int rxe_pool_cleanup(struct rxe_pool *pool) +{ + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + pool->state = rxe_pool_invalid; + if (atomic_read(&pool->num_elem) > 0) + pr_warn("%s pool destroyed with unfree'd elem\n", + pool_name(pool)); + spin_unlock_irqrestore(&pool->pool_lock, flags); + + rxe_pool_put(pool); + + return 0; +} + +static u32 alloc_index(struct rxe_pool *pool) +{ + u32 index; + u32 range = pool->max_index - pool->min_index + 1; + + index = find_next_zero_bit(pool->table, range, pool->last); + if (index >= range) + index = find_first_zero_bit(pool->table, range); + + set_bit(index, pool->table); + pool->last = index; + return index + pool->min_index; +} + +static void insert_index(struct rxe_pool *pool, struct rxe_pool_entry *new) +{ + struct rb_node **link = &pool->tree.rb_node; + struct rb_node *parent = NULL; + struct rxe_pool_entry *elem; + + while (*link) { + parent = *link; + elem = rb_entry(parent, struct rxe_pool_entry, node); + + if (elem->index == new->index) { + pr_warn("element already exists!\n"); + goto out; + } + + if (elem->index > new->index) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &pool->tree); +out: + return; +} + +static void 
insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new) +{ + struct rb_node **link = &pool->tree.rb_node; + struct rb_node *parent = NULL; + struct rxe_pool_entry *elem; + int cmp; + + while (*link) { + parent = *link; + elem = rb_entry(parent, struct rxe_pool_entry, node); + + cmp = memcmp((u8 *)elem + pool->key_offset, + (u8 *)new + pool->key_offset, pool->key_size); + + if (cmp == 0) { + pr_warn("key already exists!\n"); + goto out; + } + + if (cmp > 0) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &pool->tree); +out: + return; +} + +void rxe_add_key(void *arg, void *key) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + memcpy((u8 *)elem + pool->key_offset, key, pool->key_size); + insert_key(pool, elem); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void rxe_drop_key(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + rb_erase(&elem->node, &pool->tree); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void rxe_add_index(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + elem->index = alloc_index(pool); + insert_index(pool, elem); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void rxe_drop_index(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + clear_bit(elem->index - pool->min_index, pool->table); + rb_erase(&elem->node, &pool->tree); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void *rxe_alloc(struct rxe_pool *pool) +{ + struct rxe_pool_entry *elem; + unsigned long flags; + + 
might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); + + spin_lock_irqsave(&pool->pool_lock, flags); + if (pool->state != rxe_pool_valid) { + spin_unlock_irqrestore(&pool->pool_lock, flags); + return NULL; + } + kref_get(&pool->ref_cnt); + spin_unlock_irqrestore(&pool->pool_lock, flags); + + kref_get(&pool->rxe->ref_cnt); + + if (atomic_inc_return(&pool->num_elem) > pool->max_elem) { + atomic_dec(&pool->num_elem); + rxe_dev_put(pool->rxe); + rxe_pool_put(pool); + return NULL; + } + + elem = kmem_cache_zalloc(pool_cache(pool), + (pool->flags & RXE_POOL_ATOMIC) ? + GFP_ATOMIC : GFP_KERNEL); + + elem->pool = pool; + kref_init(&elem->ref_cnt); + + return elem; +} + +void rxe_elem_release(struct kref *kref) +{ + struct rxe_pool_entry *elem = + container_of(kref, struct rxe_pool_entry, ref_cnt); + struct rxe_pool *pool = elem->pool; + + if (pool->cleanup) + pool->cleanup(elem); + + kmem_cache_free(pool_cache(pool), elem); + atomic_dec(&pool->num_elem); + rxe_dev_put(pool->rxe); + rxe_pool_put(pool); +} + +void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) +{ + struct rb_node *node = NULL; + struct rxe_pool_entry *elem = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + + if (pool->state != rxe_pool_valid) + goto out; + + node = pool->tree.rb_node; + + while (node) { + elem = rb_entry(node, struct rxe_pool_entry, node); + + if (elem->index > index) + node = node->rb_left; + else if (elem->index < index) + node = node->rb_right; + else + break; + } + + if (node) + kref_get(&elem->ref_cnt); + +out: + spin_unlock_irqrestore(&pool->pool_lock, flags); + return node ? 
(void *)elem : NULL; +} + +void *rxe_pool_get_key(struct rxe_pool *pool, void *key) +{ + struct rb_node *node = NULL; + struct rxe_pool_entry *elem = NULL; + int cmp; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + + if (pool->state != rxe_pool_valid) + goto out; + + node = pool->tree.rb_node; + + while (node) { + elem = rb_entry(node, struct rxe_pool_entry, node); + + cmp = memcmp((u8 *)elem + pool->key_offset, + key, pool->key_size); + + if (cmp > 0) + node = node->rb_left; + else if (cmp < 0) + node = node->rb_right; + else + break; + } + + if (node) + kref_get(&elem->ref_cnt); + +out: + spin_unlock_irqrestore(&pool->pool_lock, flags); + return node ? ((void *)elem) : NULL; +} diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h new file mode 100644 index 000000000000..4d04830adcae --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_POOL_H +#define RXE_POOL_H + +#define RXE_POOL_ALIGN (16) +#define RXE_POOL_CACHE_FLAGS (0) + +enum rxe_pool_flags { + RXE_POOL_ATOMIC = BIT(0), + RXE_POOL_INDEX = BIT(1), + RXE_POOL_KEY = BIT(2), +}; + +enum rxe_elem_type { + RXE_TYPE_UC, + RXE_TYPE_PD, + RXE_TYPE_AH, + RXE_TYPE_SRQ, + RXE_TYPE_QP, + RXE_TYPE_CQ, + RXE_TYPE_MR, + RXE_TYPE_MW, + RXE_TYPE_MC_GRP, + RXE_TYPE_MC_ELEM, + RXE_NUM_TYPES, /* keep me last */ +}; + +struct rxe_type_info { + char *name; + size_t size; + void (*cleanup)(void *obj); + enum rxe_pool_flags flags; + u32 max_index; + u32 min_index; + size_t key_offset; + size_t key_size; + struct kmem_cache *cache; +}; + +extern struct rxe_type_info rxe_type_info[]; + +enum rxe_pool_state { + rxe_pool_invalid, + rxe_pool_valid, +}; + +struct rxe_pool_entry { + struct rxe_pool *pool; + struct kref ref_cnt; + struct list_head list; + + /* only used if indexed or keyed */ + struct rb_node node; + u32 index; +}; + +struct rxe_pool { + struct rxe_dev *rxe; + spinlock_t pool_lock; /* pool spinlock */ + size_t elem_size; + struct kref ref_cnt; + void (*cleanup)(void *obj); + enum rxe_pool_state state; + enum rxe_pool_flags flags; + enum rxe_elem_type type; + + unsigned int max_elem; + atomic_t num_elem; + + /* only used if indexed or keyed */ + struct rb_root tree; + unsigned long *table; + size_t table_size; + u32 max_index; + u32 min_index; + u32 last; + size_t key_offset; + size_t key_size; +}; + +/* initialize slab caches for managed objects 
*/ +int rxe_cache_init(void); + +/* cleanup slab caches for managed objects */ +void rxe_cache_exit(void); + +/* initialize a pool of objects with given limit on + * number of elements. gets parameters from rxe_type_info + * pool elements will be allocated out of a slab cache + */ +int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, + enum rxe_elem_type type, u32 max_elem); + +/* free resources from object pool */ +int rxe_pool_cleanup(struct rxe_pool *pool); + +/* allocate an object from pool */ +void *rxe_alloc(struct rxe_pool *pool); + +/* assign an index to an indexed object and insert object into + * pool's rb tree + */ +void rxe_add_index(void *elem); + +/* drop an index and remove object from rb tree */ +void rxe_drop_index(void *elem); + +/* assign a key to a keyed object and insert object into + * pool's rb tree + */ +void rxe_add_key(void *elem, void *key); + +/* remove elem from rb tree */ +void rxe_drop_key(void *elem); + +/* lookup an indexed object from index. takes a reference on object */ +void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); + +/* lookup keyed object from key. takes a reference on the object */ +void *rxe_pool_get_key(struct rxe_pool *pool, void *key); + +/* cleanup an object when all references are dropped */ +void rxe_elem_release(struct kref *kref); + +/* take a reference on an object */ +#define rxe_add_ref(elem) kref_get(&(elem)->pelem.ref_cnt) + +/* drop a reference on an object */ +#define rxe_drop_ref(elem) kref_put(&(elem)->pelem.ref_cnt, rxe_elem_release) + +#endif /* RXE_POOL_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c new file mode 100644 index 000000000000..22ba24f2a2c1 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -0,0 +1,851 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/skbuff.h> +#include <linux/delay.h> +#include <linux/sched.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" +#include "rxe_task.h" + +char *rxe_qp_state_name[] = { + [QP_STATE_RESET] = "RESET", + [QP_STATE_INIT] = "INIT", + [QP_STATE_READY] = "READY", + [QP_STATE_DRAIN] = "DRAIN", + [QP_STATE_DRAINED] = "DRAINED", + [QP_STATE_ERROR] = "ERROR", +}; + +static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, + int has_srq) +{ + if (cap->max_send_wr > rxe->attr.max_qp_wr) { + pr_warn("invalid send wr = %d > %d\n", + cap->max_send_wr, rxe->attr.max_qp_wr); + goto err1; + } + + if (cap->max_send_sge > rxe->attr.max_sge) { + pr_warn("invalid send sge = %d > %d\n", + cap->max_send_sge, rxe->attr.max_sge); + goto err1; + } + + if (!has_srq) { + if (cap->max_recv_wr > rxe->attr.max_qp_wr) { + pr_warn("invalid recv wr = %d > %d\n", + cap->max_recv_wr, rxe->attr.max_qp_wr); + goto err1; + } + + if (cap->max_recv_sge > rxe->attr.max_sge) { + pr_warn("invalid recv sge = %d > %d\n", + cap->max_recv_sge, rxe->attr.max_sge); + goto err1; + } + } + + if (cap->max_inline_data > rxe->max_inline_data) { + pr_warn("invalid max inline data = %d > %d\n", + cap->max_inline_data, rxe->max_inline_data); + goto err1; + } + + return 0; + +err1: + return -EINVAL; +} + +int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) +{ + struct ib_qp_cap *cap = &init->cap; + struct rxe_port *port; + int port_num = init->port_num; + + if (!init->recv_cq || !init->send_cq) { + pr_warn("missing cq\n"); + goto err1; + } + + if (rxe_qp_chk_cap(rxe, cap, !!init->srq)) + goto err1; + + if (init->qp_type == IB_QPT_SMI || init->qp_type == IB_QPT_GSI) { + if (port_num != 1) { + pr_warn("invalid port = %d\n", port_num); + goto err1; + } + + port = &rxe->port; + + if (init->qp_type == IB_QPT_SMI && port->qp_smi_index) { + pr_warn("SMI QP exists for port %d\n", port_num); + goto err1; + } + + if (init->qp_type == IB_QPT_GSI && 
port->qp_gsi_index) { + pr_warn("GSI QP exists for port %d\n", port_num); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static int alloc_rd_atomic_resources(struct rxe_qp *qp, unsigned int n) +{ + qp->resp.res_head = 0; + qp->resp.res_tail = 0; + qp->resp.resources = kcalloc(n, sizeof(struct resp_res), GFP_KERNEL); + + if (!qp->resp.resources) + return -ENOMEM; + + return 0; +} + +static void free_rd_atomic_resources(struct rxe_qp *qp) +{ + if (qp->resp.resources) { + int i; + + for (i = 0; i < qp->attr.max_rd_atomic; i++) { + struct resp_res *res = &qp->resp.resources[i]; + + free_rd_atomic_resource(qp, res); + } + kfree(qp->resp.resources); + qp->resp.resources = NULL; + } +} + +void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res) +{ + if (res->type == RXE_ATOMIC_MASK) { + rxe_drop_ref(qp); + kfree_skb(res->atomic.skb); + } else if (res->type == RXE_READ_MASK) { + if (res->read.mr) + rxe_drop_ref(res->read.mr); + } + res->type = 0; +} + +static void cleanup_rd_atomic_resources(struct rxe_qp *qp) +{ + int i; + struct resp_res *res; + + if (qp->resp.resources) { + for (i = 0; i < qp->attr.max_rd_atomic; i++) { + res = &qp->resp.resources[i]; + free_rd_atomic_resource(qp, res); + } + } +} + +static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init) +{ + struct rxe_port *port; + u32 qpn; + + qp->sq_sig_type = init->sq_sig_type; + qp->attr.path_mtu = 1; + qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu); + + qpn = qp->pelem.index; + port = &rxe->port; + + switch (init->qp_type) { + case IB_QPT_SMI: + qp->ibqp.qp_num = 0; + port->qp_smi_index = qpn; + qp->attr.port_num = init->port_num; + break; + + case IB_QPT_GSI: + qp->ibqp.qp_num = 1; + port->qp_gsi_index = qpn; + qp->attr.port_num = init->port_num; + break; + + default: + qp->ibqp.qp_num = qpn; + break; + } + + INIT_LIST_HEAD(&qp->grp_list); + + skb_queue_head_init(&qp->send_pkts); + + spin_lock_init(&qp->grp_lock); + 
spin_lock_init(&qp->state_lock); + + atomic_set(&qp->ssn, 0); + atomic_set(&qp->skb_out, 0); +} + +static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init, + struct ib_ucontext *context, struct ib_udata *udata) +{ + int err; + int wqe_size; + + err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); + if (err < 0) + return err; + qp->sk->sk->sk_user_data = qp; + + qp->sq.max_wr = init->cap.max_send_wr; + qp->sq.max_sge = init->cap.max_send_sge; + qp->sq.max_inline = init->cap.max_inline_data; + + wqe_size = max_t(int, sizeof(struct rxe_send_wqe) + + qp->sq.max_sge * sizeof(struct ib_sge), + sizeof(struct rxe_send_wqe) + + qp->sq.max_inline); + + qp->sq.queue = rxe_queue_init(rxe, + &qp->sq.max_wr, + wqe_size); + if (!qp->sq.queue) + return -ENOMEM; + + err = do_mmap_info(rxe, udata, true, + context, qp->sq.queue->buf, + qp->sq.queue->buf_size, &qp->sq.queue->ip); + + if (err) { + kvfree(qp->sq.queue->buf); + kfree(qp->sq.queue); + return err; + } + + qp->req.wqe_index = producer_index(qp->sq.queue); + qp->req.state = QP_STATE_RESET; + qp->req.opcode = -1; + qp->comp.opcode = -1; + + spin_lock_init(&qp->sq.sq_lock); + skb_queue_head_init(&qp->req_pkts); + + rxe_init_task(rxe, &qp->req.task, qp, + rxe_requester, "req"); + rxe_init_task(rxe, &qp->comp.task, qp, + rxe_completer, "comp"); + + init_timer(&qp->rnr_nak_timer); + qp->rnr_nak_timer.function = rnr_nak_timer; + qp->rnr_nak_timer.data = (unsigned long)qp; + + init_timer(&qp->retrans_timer); + qp->retrans_timer.function = retransmit_timer; + qp->retrans_timer.data = (unsigned long)qp; + qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ + + return 0; +} + +static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init, + struct ib_ucontext *context, struct ib_udata *udata) +{ + int err; + int wqe_size; + + if (!qp->srq) { + qp->rq.max_wr = init->cap.max_recv_wr; + qp->rq.max_sge = init->cap.max_recv_sge; 
+ + wqe_size = rcv_wqe_size(qp->rq.max_sge); + + pr_debug("max_wr = %d, max_sge = %d, wqe_size = %d\n", + qp->rq.max_wr, qp->rq.max_sge, wqe_size); + + qp->rq.queue = rxe_queue_init(rxe, + &qp->rq.max_wr, + wqe_size); + if (!qp->rq.queue) + return -ENOMEM; + + err = do_mmap_info(rxe, udata, false, context, + qp->rq.queue->buf, + qp->rq.queue->buf_size, + &qp->rq.queue->ip); + if (err) { + kvfree(qp->rq.queue->buf); + kfree(qp->rq.queue); + return err; + } + } + + spin_lock_init(&qp->rq.producer_lock); + spin_lock_init(&qp->rq.consumer_lock); + + skb_queue_head_init(&qp->resp_pkts); + + rxe_init_task(rxe, &qp->resp.task, qp, + rxe_responder, "resp"); + + qp->resp.opcode = OPCODE_NONE; + qp->resp.msn = 0; + qp->resp.state = QP_STATE_RESET; + + return 0; +} + +/* called by the create qp verb */ +int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, + struct ib_qp_init_attr *init, struct ib_udata *udata, + struct ib_pd *ibpd) +{ + int err; + struct rxe_cq *rcq = to_rcq(init->recv_cq); + struct rxe_cq *scq = to_rcq(init->send_cq); + struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL; + struct ib_ucontext *context = udata ? 
ibpd->uobject->context : NULL; + + rxe_add_ref(pd); + rxe_add_ref(rcq); + rxe_add_ref(scq); + if (srq) + rxe_add_ref(srq); + + qp->pd = pd; + qp->rcq = rcq; + qp->scq = scq; + qp->srq = srq; + + rxe_qp_init_misc(rxe, qp, init); + + err = rxe_qp_init_req(rxe, qp, init, context, udata); + if (err) + goto err1; + + err = rxe_qp_init_resp(rxe, qp, init, context, udata); + if (err) + goto err2; + + qp->attr.qp_state = IB_QPS_RESET; + qp->valid = 1; + + return 0; + +err2: + rxe_queue_cleanup(qp->sq.queue); +err1: + if (srq) + rxe_drop_ref(srq); + rxe_drop_ref(scq); + rxe_drop_ref(rcq); + rxe_drop_ref(pd); + + return err; +} + +/* called by the query qp verb */ +int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init) +{ + init->event_handler = qp->ibqp.event_handler; + init->qp_context = qp->ibqp.qp_context; + init->send_cq = qp->ibqp.send_cq; + init->recv_cq = qp->ibqp.recv_cq; + init->srq = qp->ibqp.srq; + + init->cap.max_send_wr = qp->sq.max_wr; + init->cap.max_send_sge = qp->sq.max_sge; + init->cap.max_inline_data = qp->sq.max_inline; + + if (!qp->srq) { + init->cap.max_recv_wr = qp->rq.max_wr; + init->cap.max_recv_sge = qp->rq.max_sge; + } + + init->sq_sig_type = qp->sq_sig_type; + + init->qp_type = qp->ibqp.qp_type; + init->port_num = 1; + + return 0; +} + +/* called by the modify qp verb, this routine checks all the parameters before + * making any changes + */ +int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_attr *attr, int mask) +{ + enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : qp->attr.qp_state; + enum ib_qp_state new_state = (mask & IB_QP_STATE) ? 
+ attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask, + IB_LINK_LAYER_ETHERNET)) { + pr_warn("invalid mask or state for qp\n"); + goto err1; + } + + if (mask & IB_QP_STATE) { + if (cur_state == IB_QPS_SQD) { + if (qp->req.state == QP_STATE_DRAIN && + new_state != IB_QPS_ERR) + goto err1; + } + } + + if (mask & IB_QP_PORT) { + if (attr->port_num != 1) { + pr_warn("invalid port %d\n", attr->port_num); + goto err1; + } + } + + if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq)) + goto err1; + + if (mask & IB_QP_AV && rxe_av_chk_attr(rxe, &attr->ah_attr)) + goto err1; + + if (mask & IB_QP_ALT_PATH) { + if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr)) + goto err1; + if (attr->alt_port_num != 1) { + pr_warn("invalid alt port %d\n", attr->alt_port_num); + goto err1; + } + if (attr->alt_timeout > 31) { + pr_warn("invalid QP alt timeout %d > 31\n", + attr->alt_timeout); + goto err1; + } + } + + if (mask & IB_QP_PATH_MTU) { + struct rxe_port *port = &rxe->port; + + enum ib_mtu max_mtu = port->attr.max_mtu; + enum ib_mtu mtu = attr->path_mtu; + + if (mtu > max_mtu) { + pr_debug("invalid mtu (%d) > (%d)\n", + ib_mtu_enum_to_int(mtu), + ib_mtu_enum_to_int(max_mtu)); + goto err1; + } + } + + if (mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) { + pr_warn("invalid max_rd_atomic %d > %d\n", + attr->max_rd_atomic, + rxe->attr.max_qp_rd_atom); + goto err1; + } + } + + if (mask & IB_QP_TIMEOUT) { + if (attr->timeout > 31) { + pr_warn("invalid QP timeout %d > 31\n", + attr->timeout); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +/* move the qp to the reset state */ +static void rxe_qp_reset(struct rxe_qp *qp) +{ + /* stop tasks from running */ + rxe_disable_task(&qp->resp.task); + + /* stop request/comp */ + if (qp->sq.queue) { + if (qp_type(qp) == IB_QPT_RC) + rxe_disable_task(&qp->comp.task); + rxe_disable_task(&qp->req.task); + } + + /* move qp to the reset state 
*/ + qp->req.state = QP_STATE_RESET; + qp->resp.state = QP_STATE_RESET; + + /* let state machines reset themselves drain work and packet queues + * etc. + */ + __rxe_do_task(&qp->resp.task); + + if (qp->sq.queue) { + __rxe_do_task(&qp->comp.task); + __rxe_do_task(&qp->req.task); + } + + /* cleanup attributes */ + atomic_set(&qp->ssn, 0); + qp->req.opcode = -1; + qp->req.need_retry = 0; + qp->req.noack_pkts = 0; + qp->resp.msn = 0; + qp->resp.opcode = -1; + qp->resp.drop_msg = 0; + qp->resp.goto_error = 0; + qp->resp.sent_psn_nak = 0; + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + cleanup_rd_atomic_resources(qp); + + /* reenable tasks */ + rxe_enable_task(&qp->resp.task); + + if (qp->sq.queue) { + if (qp_type(qp) == IB_QPT_RC) + rxe_enable_task(&qp->comp.task); + + rxe_enable_task(&qp->req.task); + } +} + +/* drain the send queue */ +static void rxe_qp_drain(struct rxe_qp *qp) +{ + if (qp->sq.queue) { + if (qp->req.state != QP_STATE_DRAINED) { + qp->req.state = QP_STATE_DRAIN; + if (qp_type(qp) == IB_QPT_RC) + rxe_run_task(&qp->comp.task, 1); + else + __rxe_do_task(&qp->comp.task); + rxe_run_task(&qp->req.task, 1); + } + } +} + +/* move the qp to the error state */ +void rxe_qp_error(struct rxe_qp *qp) +{ + qp->req.state = QP_STATE_ERROR; + qp->resp.state = QP_STATE_ERROR; + + /* drain work and packet queues */ + rxe_run_task(&qp->resp.task, 1); + + if (qp_type(qp) == IB_QPT_RC) + rxe_run_task(&qp->comp.task, 1); + else + __rxe_do_task(&qp->comp.task); + rxe_run_task(&qp->req.task, 1); +} + +/* called by the modify qp verb */ +int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + + if (mask & IB_QP_MAX_QP_RD_ATOMIC) { + int max_rd_atomic = __roundup_pow_of_two(attr->max_rd_atomic); + + free_rd_atomic_resources(qp); + + err = alloc_rd_atomic_resources(qp, 
max_rd_atomic); + if (err) + return err; + + qp->attr.max_rd_atomic = max_rd_atomic; + atomic_set(&qp->req.rd_atomic, max_rd_atomic); + } + + if (mask & IB_QP_CUR_STATE) + qp->attr.cur_qp_state = attr->qp_state; + + if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY) + qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify; + + if (mask & IB_QP_ACCESS_FLAGS) + qp->attr.qp_access_flags = attr->qp_access_flags; + + if (mask & IB_QP_PKEY_INDEX) + qp->attr.pkey_index = attr->pkey_index; + + if (mask & IB_QP_PORT) + qp->attr.port_num = attr->port_num; + + if (mask & IB_QP_QKEY) + qp->attr.qkey = attr->qkey; + + if (mask & IB_QP_AV) { + ib_get_cached_gid(&rxe->ib_dev, 1, + attr->ah_attr.grh.sgid_index, &sgid, + &sgid_attr); + rxe_av_from_attr(rxe, attr->port_num, &qp->pri_av, + &attr->ah_attr); + rxe_av_fill_ip_info(rxe, &qp->pri_av, &attr->ah_attr, + &sgid_attr, &sgid); + if (sgid_attr.ndev) + dev_put(sgid_attr.ndev); + } + + if (mask & IB_QP_ALT_PATH) { + ib_get_cached_gid(&rxe->ib_dev, 1, + attr->alt_ah_attr.grh.sgid_index, &sgid, + &sgid_attr); + + rxe_av_from_attr(rxe, attr->alt_port_num, &qp->alt_av, + &attr->alt_ah_attr); + rxe_av_fill_ip_info(rxe, &qp->alt_av, &attr->alt_ah_attr, + &sgid_attr, &sgid); + if (sgid_attr.ndev) + dev_put(sgid_attr.ndev); + + qp->attr.alt_port_num = attr->alt_port_num; + qp->attr.alt_pkey_index = attr->alt_pkey_index; + qp->attr.alt_timeout = attr->alt_timeout; + } + + if (mask & IB_QP_PATH_MTU) { + qp->attr.path_mtu = attr->path_mtu; + qp->mtu = ib_mtu_enum_to_int(attr->path_mtu); + } + + if (mask & IB_QP_TIMEOUT) { + qp->attr.timeout = attr->timeout; + if (attr->timeout == 0) { + qp->qp_timeout_jiffies = 0; + } else { + /* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */ + int j = nsecs_to_jiffies(4096ULL << attr->timeout); + + qp->qp_timeout_jiffies = j ? 
j : 1; + } + } + + if (mask & IB_QP_RETRY_CNT) { + qp->attr.retry_cnt = attr->retry_cnt; + qp->comp.retry_cnt = attr->retry_cnt; + pr_debug("set retry count = %d\n", attr->retry_cnt); + } + + if (mask & IB_QP_RNR_RETRY) { + qp->attr.rnr_retry = attr->rnr_retry; + qp->comp.rnr_retry = attr->rnr_retry; + pr_debug("set rnr retry count = %d\n", attr->rnr_retry); + } + + if (mask & IB_QP_RQ_PSN) { + qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK); + qp->resp.psn = qp->attr.rq_psn; + pr_debug("set resp psn = 0x%x\n", qp->resp.psn); + } + + if (mask & IB_QP_MIN_RNR_TIMER) { + qp->attr.min_rnr_timer = attr->min_rnr_timer; + pr_debug("set min rnr timer = 0x%x\n", + attr->min_rnr_timer); + } + + if (mask & IB_QP_SQ_PSN) { + qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK); + qp->req.psn = qp->attr.sq_psn; + qp->comp.psn = qp->attr.sq_psn; + pr_debug("set req psn = 0x%x\n", qp->req.psn); + } + + if (mask & IB_QP_MAX_DEST_RD_ATOMIC) { + qp->attr.max_dest_rd_atomic = + __roundup_pow_of_two(attr->max_dest_rd_atomic); + } + + if (mask & IB_QP_PATH_MIG_STATE) + qp->attr.path_mig_state = attr->path_mig_state; + + if (mask & IB_QP_DEST_QPN) + qp->attr.dest_qp_num = attr->dest_qp_num; + + if (mask & IB_QP_STATE) { + qp->attr.qp_state = attr->qp_state; + + switch (attr->qp_state) { + case IB_QPS_RESET: + pr_debug("qp state -> RESET\n"); + rxe_qp_reset(qp); + break; + + case IB_QPS_INIT: + pr_debug("qp state -> INIT\n"); + qp->req.state = QP_STATE_INIT; + qp->resp.state = QP_STATE_INIT; + break; + + case IB_QPS_RTR: + pr_debug("qp state -> RTR\n"); + qp->resp.state = QP_STATE_READY; + break; + + case IB_QPS_RTS: + pr_debug("qp state -> RTS\n"); + qp->req.state = QP_STATE_READY; + break; + + case IB_QPS_SQD: + pr_debug("qp state -> SQD\n"); + rxe_qp_drain(qp); + break; + + case IB_QPS_SQE: + pr_warn("qp state -> SQE !!?\n"); + /* Not possible from modify_qp. 
*/ + break; + + case IB_QPS_ERR: + pr_debug("qp state -> ERR\n"); + rxe_qp_error(qp); + break; + } + } + + return 0; +} + +/* called by the query qp verb */ +int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + *attr = qp->attr; + + attr->rq_psn = qp->resp.psn; + attr->sq_psn = qp->req.psn; + + attr->cap.max_send_wr = qp->sq.max_wr; + attr->cap.max_send_sge = qp->sq.max_sge; + attr->cap.max_inline_data = qp->sq.max_inline; + + if (!qp->srq) { + attr->cap.max_recv_wr = qp->rq.max_wr; + attr->cap.max_recv_sge = qp->rq.max_sge; + } + + rxe_av_to_attr(rxe, &qp->pri_av, &attr->ah_attr); + rxe_av_to_attr(rxe, &qp->alt_av, &attr->alt_ah_attr); + + if (qp->req.state == QP_STATE_DRAIN) { + attr->sq_draining = 1; + /* applications that get this state + * typically spin on it. yield the + * processor + */ + cond_resched(); + } else { + attr->sq_draining = 0; + } + + pr_debug("attr->sq_draining = %d\n", attr->sq_draining); + + return 0; +} + +/* called by the destroy qp verb */ +void rxe_qp_destroy(struct rxe_qp *qp) +{ + qp->valid = 0; + qp->qp_timeout_jiffies = 0; + rxe_cleanup_task(&qp->resp.task); + + del_timer_sync(&qp->retrans_timer); + del_timer_sync(&qp->rnr_nak_timer); + + rxe_cleanup_task(&qp->req.task); + if (qp_type(qp) == IB_QPT_RC) + rxe_cleanup_task(&qp->comp.task); + + /* flush out any receive wr's or pending requests */ + __rxe_do_task(&qp->req.task); + if (qp->sq.queue) { + __rxe_do_task(&qp->comp.task); + __rxe_do_task(&qp->req.task); + } +} + +/* called when the last reference to the qp is dropped */ +void rxe_qp_cleanup(void *arg) +{ + struct rxe_qp *qp = arg; + + rxe_drop_all_mcast_groups(qp); + + if (qp->sq.queue) + rxe_queue_cleanup(qp->sq.queue); + + if (qp->srq) + rxe_drop_ref(qp->srq); + + if (qp->rq.queue) + rxe_queue_cleanup(qp->rq.queue); + + if (qp->scq) + rxe_drop_ref(qp->scq); + if (qp->rcq) + rxe_drop_ref(qp->rcq); + if (qp->pd) + rxe_drop_ref(qp->pd); + + if 
(qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + free_rd_atomic_resources(qp); + + kernel_sock_shutdown(qp->sk, SHUT_RDWR); +} diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c new file mode 100644 index 000000000000..08274254eb88 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_queue.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/vmalloc.h> +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int do_mmap_info(struct rxe_dev *rxe, + struct ib_udata *udata, + bool is_req, + struct ib_ucontext *context, + struct rxe_queue_buf *buf, + size_t buf_size, + struct rxe_mmap_info **ip_p) +{ + int err; + u32 len, offset; + struct rxe_mmap_info *ip = NULL; + + if (udata) { + if (is_req) { + len = udata->outlen - sizeof(struct mminfo); + offset = sizeof(struct mminfo); + } else { + len = udata->outlen; + offset = 0; + } + + if (len < sizeof(ip->info)) + goto err1; + + ip = rxe_create_mmap_info(rxe, buf_size, context, buf); + if (!ip) + goto err1; + + err = copy_to_user(udata->outbuf + offset, &ip->info, + sizeof(ip->info)); + if (err) + goto err2; + + spin_lock_bh(&rxe->pending_lock); + list_add(&ip->pending_mmaps, &rxe->pending_mmaps); + spin_unlock_bh(&rxe->pending_lock); + } + + *ip_p = ip; + + return 0; + +err2: + kfree(ip); +err1: + return -EINVAL; +} + +struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, + int *num_elem, + unsigned int elem_size) +{ + struct rxe_queue *q; + size_t buf_size; + unsigned int num_slots; + + /* num_elem == 0 is allowed, but uninteresting */ + if (*num_elem < 0) + goto err1; + + q = kmalloc(sizeof(*q), GFP_KERNEL); + if (!q) + goto err1; + + q->rxe = rxe; + + /* used in resize, only need to copy used part of queue */ + q->elem_size = elem_size; + + /* pad element up to at least a cacheline and always a power of 2 */ + if (elem_size < cache_line_size()) + elem_size = cache_line_size(); + elem_size = roundup_pow_of_two(elem_size); + + q->log2_elem_size = order_base_2(elem_size); + + num_slots = *num_elem + 1; + num_slots = roundup_pow_of_two(num_slots); + q->index_mask = num_slots - 1; + + buf_size = sizeof(struct rxe_queue_buf) + num_slots * elem_size; + + q->buf = vmalloc_user(buf_size); + if (!q->buf) + goto err2; + + q->buf->log2_elem_size = q->log2_elem_size; + q->buf->index_mask = q->index_mask; + + q->buf_size = buf_size; + + 
*num_elem = num_slots - 1; + return q; + +err2: + kfree(q); +err1: + return NULL; +} + +/* copies elements from original q to new q and then swaps the contents of the + * two q headers. This is so that if anyone is holding a pointer to q it will + * still work + */ +static int resize_finish(struct rxe_queue *q, struct rxe_queue *new_q, + unsigned int num_elem) +{ + if (!queue_empty(q) && (num_elem < queue_count(q))) + return -EINVAL; + + while (!queue_empty(q)) { + memcpy(producer_addr(new_q), consumer_addr(q), + new_q->elem_size); + advance_producer(new_q); + advance_consumer(q); + } + + swap(*q, *new_q); + + return 0; +} + +int rxe_queue_resize(struct rxe_queue *q, + unsigned int *num_elem_p, + unsigned int elem_size, + struct ib_ucontext *context, + struct ib_udata *udata, + spinlock_t *producer_lock, + spinlock_t *consumer_lock) +{ + struct rxe_queue *new_q; + unsigned int num_elem = *num_elem_p; + int err; + unsigned long flags = 0, flags1; + + new_q = rxe_queue_init(q->rxe, &num_elem, elem_size); + if (!new_q) + return -ENOMEM; + + err = do_mmap_info(new_q->rxe, udata, false, context, new_q->buf, + new_q->buf_size, &new_q->ip); + if (err) { + vfree(new_q->buf); + kfree(new_q); + goto err1; + } + + spin_lock_irqsave(consumer_lock, flags1); + + if (producer_lock) { + spin_lock_irqsave(producer_lock, flags); + err = resize_finish(q, new_q, num_elem); + spin_unlock_irqrestore(producer_lock, flags); + } else { + err = resize_finish(q, new_q, num_elem); + } + + spin_unlock_irqrestore(consumer_lock, flags1); + + rxe_queue_cleanup(new_q); /* new/old dep on err */ + if (err) + goto err1; + + *num_elem_p = num_elem; + return 0; + +err1: + return err; +} + +void rxe_queue_cleanup(struct rxe_queue *q) +{ + if (q->ip) + kref_put(&q->ip->ref, rxe_mmap_release); + else + vfree(q->buf); + + kfree(q); +} diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h new file mode 100644 index 000000000000..239fd609c31e --- /dev/null +++ 
b/drivers/infiniband/sw/rxe/rxe_queue.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_QUEUE_H
+#define RXE_QUEUE_H
+
+/* implements a simple circular buffer that can optionally be
+ * shared between user space and the kernel and can be resized
+ *
+ * the requested element size is rounded up to a power of 2
+ * and the number of elements in the buffer is also rounded
+ * up to a power of 2. Since the queue is empty when the
+ * producer and consumer indices match the maximum capacity
+ * of the queue is one less than the number of element slots
+ */
+
+/* this data structure is shared between user space and kernel
+ * space for those cases where the queue is shared. It contains
+ * the producer and consumer indices. It also contains a copy
+ * of the queue size parameters for user space to use but the
+ * kernel must use the parameters in the rxe_queue struct
+ * this MUST MATCH the corresponding librxe struct
+ * for performance reasons arrange to have producer and consumer
+ * pointers in separate cache lines
+ * the kernel should always mask the indices to avoid accessing
+ * memory outside of the data area
+ */
+struct rxe_queue_buf {
+	__u32			log2_elem_size;
+	__u32			index_mask;
+	__u32			pad_1[30];
+	__u32			producer_index;
+	__u32			pad_2[31];
+	__u32			consumer_index;
+	__u32			pad_3[31];
+	__u8			data[];
+};
+
+/* kernel-side queue descriptor. index_mask and log2_elem_size here are
+ * the copies the kernel uses; the ones inside buf exist for user space
+ * and are not read by the helpers below
+ */
+struct rxe_queue {
+	struct rxe_dev		*rxe;
+	struct rxe_queue_buf	*buf;
+	struct rxe_mmap_info	*ip;
+	size_t			buf_size;
+	size_t			elem_size;
+	unsigned int		log2_elem_size;
+	unsigned int		index_mask;
+};
+
+int do_mmap_info(struct rxe_dev *rxe,
+		 struct ib_udata *udata,
+		 bool is_req,
+		 struct ib_ucontext *context,
+		 struct rxe_queue_buf *buf,
+		 size_t buf_size,
+		 struct rxe_mmap_info **ip_p);
+
+struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe,
+				 int *num_elem,
+				 unsigned int elem_size);
+
+int rxe_queue_resize(struct rxe_queue *q,
+		     unsigned int *num_elem_p,
+		     unsigned int elem_size,
+		     struct ib_ucontext *context,
+		     struct ib_udata *udata,
+		     /* Protect producers while resizing queue */
+		     spinlock_t *producer_lock,
+		     /* Protect consumers while resizing queue */
+		     spinlock_t *consumer_lock);
+
+void rxe_queue_cleanup(struct rxe_queue *queue);
+
+/* index helpers: masking and scaling always use the parameters kept in
+ * struct rxe_queue, so a value written into the shared buffer cannot
+ * move an access outside of the data area
+ */
+static inline int next_index(struct rxe_queue *q, int index)
+{
+	return (index + 1) & q->index_mask;
+}
+
+static inline int queue_empty(struct rxe_queue *q)
+{
+	return ((q->buf->producer_index - q->buf->consumer_index)
+			& q->index_mask) == 0;
+}
+
+static inline int queue_full(struct rxe_queue *q)
+{
+	return ((q->buf->producer_index + 1 - q->buf->consumer_index)
+			& q->index_mask) == 0;
+}
+
+static inline void advance_producer(struct rxe_queue *q)
+{
+	q->buf->producer_index = (q->buf->producer_index + 1)
+			& q->index_mask;
+}
+
+static inline void advance_consumer(struct rxe_queue *q)
+{
+	q->buf->consumer_index = (q->buf->consumer_index + 1)
+			& q->index_mask;
+}
+
+static inline void *producer_addr(struct rxe_queue *q)
+{
+	return q->buf->data + ((q->buf->producer_index & q->index_mask)
+				<< q->log2_elem_size);
+}
+
+static inline void *consumer_addr(struct rxe_queue *q)
+{
+	return q->buf->data + ((q->buf->consumer_index & q->index_mask)
+				<< q->log2_elem_size);
+}
+
+static inline unsigned int producer_index(struct rxe_queue *q)
+{
+	return q->buf->producer_index;
+}
+
+static inline unsigned int consumer_index(struct rxe_queue *q)
+{
+	return q->buf->consumer_index;
+}
+
+/* address of the slot for an arbitrary index; the index is masked first */
+static inline void *addr_from_index(struct rxe_queue *q, unsigned int index)
+{
+	return q->buf->data + ((index & q->index_mask)
+				<< q->log2_elem_size);
+}
+
+/* inverse of addr_from_index: slot index of an element address */
+static inline unsigned int index_from_addr(const struct rxe_queue *q,
+					   const void *addr)
+{
+	return (((const u8 *)addr - q->buf->data) >> q->log2_elem_size)
+		& q->index_mask;
+}
+
+/* number of elements currently between consumer and producer */
+static inline unsigned int queue_count(const struct rxe_queue *q)
+{
+	return (q->buf->producer_index - q->buf->consumer_index)
+		& q->index_mask;
+}
+
+/* element at the consumer index, or NULL when the queue is empty */
+static inline void *queue_head(struct rxe_queue *q)
+{
+	return queue_empty(q) ? NULL : consumer_addr(q);
+}
+
+#endif /* RXE_QUEUE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c
new file mode 100644
index 000000000000..3d464c23e08b
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_recv.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc.
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* Check that the packet's transport matches the QP type and that the QP
+ * is in a state that accepts it: request packets need resp.state to be
+ * QP_STATE_READY, other packets need req.state between QP_STATE_READY
+ * and QP_STATE_DRAINED.
+ *
+ * Returns 0 when the packet is acceptable, -EINVAL otherwise.
+ */
+static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+			    struct rxe_qp *qp)
+{
+	/* the upper three opcode bits carry the transport type; compare
+	 * them exactly, since IB_OPCODE_RC is zero and a bitwise AND
+	 * against it can never detect a mismatch
+	 */
+	unsigned int pkt_type = pkt->opcode & 0xe0;
+
+	if (unlikely(!qp->valid))
+		goto err1;
+
+	switch (qp_type(qp)) {
+	case IB_QPT_RC:
+		if (unlikely(pkt_type != IB_OPCODE_RC)) {
+			pr_warn_ratelimited("bad qp type\n");
+			goto err1;
+		}
+		break;
+	case IB_QPT_UC:
+		if (unlikely(pkt_type != IB_OPCODE_UC)) {
+			pr_warn_ratelimited("bad qp type\n");
+			goto err1;
+		}
+		break;
+	case IB_QPT_UD:
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		if (unlikely(pkt_type != IB_OPCODE_UD)) {
+			pr_warn_ratelimited("bad qp type\n");
+			goto err1;
+		}
+		break;
+	default:
+		pr_warn_ratelimited("unsupported qp type\n");
+		goto err1;
+	}
+
+	if (pkt->mask & RXE_REQ_MASK) {
+		if (unlikely(qp->resp.state != QP_STATE_READY))
+			goto err1;
+	} else if (unlikely(qp->req.state < QP_STATE_READY ||
+				qp->req.state > QP_STATE_DRAINED)) {
+		goto err1;
+	}
+
+	return 0;
+
+err1:
+	return -EINVAL;
+}
+
+/* bump the port's bad pkey counter, saturating at 0xffff */
+static void set_bad_pkey_cntr(struct rxe_port *port)
+{
+	spin_lock_bh(&port->port_lock);
+	port->attr.bad_pkey_cntr = min((u32)0xffff,
+				       port->attr.bad_pkey_cntr + 1);
+	spin_unlock_bh(&port->port_lock);
+}
+
+/* bump the port's qkey violation counter, saturating at 0xffff */
+static void set_qkey_viol_cntr(struct rxe_port *port)
+{
+	spin_lock_bh(&port->port_lock);
+	port->attr.qkey_viol_cntr = min((u32)0xffff,
+					port->attr.qkey_viol_cntr + 1);
+	spin_unlock_bh(&port->port_lock);
+}
+
+/* Validate the packet's pkey and, for UD/GSI QPs, its qkey.
+ *
+ * For qpn 1 the pkey may match any entry of the port's table; for any
+ * other non-zero qpn it must match the QP's configured entry. The index
+ * that matched is stored in pkt->pkey_index. Mismatches are counted on
+ * the port.
+ *
+ * Returns 0 on success, -EINVAL on a pkey or qkey mismatch.
+ */
+static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+		      u32 qpn, struct rxe_qp *qp)
+{
+	int i;
+	int found_pkey = 0;
+	struct rxe_port *port = &rxe->port;
+	u16 pkey = bth_pkey(pkt);
+
+	pkt->pkey_index = 0;
+
+	if (qpn == 1) {
+		for (i = 0; i < port->attr.pkey_tbl_len; i++) {
+			if (pkey_match(pkey, port->pkey_tbl[i])) {
+				pkt->pkey_index = i;
+				found_pkey = 1;
+				break;
+			}
+		}
+
+		if (!found_pkey) {
+			pr_warn_ratelimited("bad pkey = 0x%x\n", pkey);
+			set_bad_pkey_cntr(port);
+			goto err1;
+		}
+	} else if (qpn != 0) {
+		if (unlikely(!pkey_match(pkey,
+					 port->pkey_tbl[qp->attr.pkey_index]
+					))) {
+			pr_warn_ratelimited("bad pkey = 0x%0x\n", pkey);
+			set_bad_pkey_cntr(port);
+			goto err1;
+		}
+		pkt->pkey_index = qp->attr.pkey_index;
+	}
+
+	if ((qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) &&
+	    qpn != 0 && pkt->mask) {
+		u32 qkey = (qpn == 1) ? GSI_QKEY : qp->attr.qkey;
+
+		if (unlikely(deth_qkey(pkt) != qkey)) {
+			pr_warn_ratelimited("bad qkey, got 0x%x expected 0x%x for qpn 0x%x\n",
+					    deth_qkey(pkt), qkey, qpn);
+			set_qkey_viol_cntr(port);
+			goto err1;
+		}
+	}
+
+	return 0;
+
+err1:
+	return -EINVAL;
+}
+
+/* For RC and UC QPs, check that the packet arrived on the QP's port and
+ * that its IP source/destination addresses are the mirror image of the
+ * QP's primary address vector. Other QP types are not checked.
+ *
+ * Returns 0 when the addresses are acceptable, -EINVAL otherwise.
+ */
+static int check_addr(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+		      struct rxe_qp *qp)
+{
+	struct sk_buff *skb = PKT_TO_SKB(pkt);
+
+	if (qp_type(qp) != IB_QPT_RC && qp_type(qp) != IB_QPT_UC)
+		goto done;
+
+	if (unlikely(pkt->port_num != qp->attr.port_num)) {
+		pr_warn_ratelimited("port %d != qp port %d\n",
+				    pkt->port_num, qp->attr.port_num);
+		goto err1;
+	}
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct in_addr *saddr =
+			&qp->pri_av.sgid_addr._sockaddr_in.sin_addr;
+		struct in_addr *daddr =
+			&qp->pri_av.dgid_addr._sockaddr_in.sin_addr;
+
+		if (ip_hdr(skb)->daddr != saddr->s_addr) {
+			pr_warn_ratelimited("dst addr %pI4 != qp source addr %pI4\n",
+					    &ip_hdr(skb)->daddr,
+					    &saddr->s_addr);
+			goto err1;
+		}
+
+		if (ip_hdr(skb)->saddr != daddr->s_addr) {
+			pr_warn_ratelimited("source addr %pI4 != qp dst addr %pI4\n",
+					    &ip_hdr(skb)->saddr,
+					    &daddr->s_addr);
+			goto err1;
+		}
+
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct in6_addr *saddr =
+			&qp->pri_av.sgid_addr._sockaddr_in6.sin6_addr;
+		struct in6_addr *daddr =
+			&qp->pri_av.dgid_addr._sockaddr_in6.sin6_addr;
+
+		if (memcmp(&ipv6_hdr(skb)->daddr, saddr, sizeof(*saddr))) {
+			pr_warn_ratelimited("dst addr %pI6 != qp source addr %pI6\n",
+					    &ipv6_hdr(skb)->daddr, saddr);
+			goto err1;
+		}
+
+		if (memcmp(&ipv6_hdr(skb)->saddr, daddr, sizeof(*daddr))) {
+			pr_warn_ratelimited("source addr %pI6 != qp dst addr %pI6\n",
+					    &ipv6_hdr(skb)->saddr, daddr);
+			goto err1;
+		}
+	}
+
+done:
+	return 0;
+
+err1:
+	return -EINVAL;
+}
+
+/* Validate the BTH and look up the destination QP.
+ *
+ * For a unicast qpn the QP is looked up (taking a reference) and the
+ * type/state, address and key checks are run; on success pkt->qp holds
+ * the referenced QP. For the multicast qpn only the presence of a GRH is
+ * checked and pkt->qp is set to NULL.
+ *
+ * Returns 0 on success, -EINVAL otherwise (no reference is kept).
+ */
+static int hdr_check(struct rxe_pkt_info *pkt)
+{
+	struct rxe_dev *rxe = pkt->rxe;
+	struct rxe_port *port = &rxe->port;
+	struct rxe_qp *qp = NULL;
+	u32 qpn = bth_qpn(pkt);
+	int index;
+	int err;
+
+	if (unlikely(bth_tver(pkt) != BTH_TVER)) {
+		pr_warn_ratelimited("bad tver\n");
+		goto err1;
+	}
+
+	if (qpn != IB_MULTICAST_QPN) {
+		index = (qpn == 0) ? port->qp_smi_index :
+			((qpn == 1) ? port->qp_gsi_index : qpn);
+		qp = rxe_pool_get_index(&rxe->qp_pool, index);
+		if (unlikely(!qp)) {
+			pr_warn_ratelimited("no qp matches qpn 0x%x\n", qpn);
+			goto err1;
+		}
+
+		err = check_type_state(rxe, pkt, qp);
+		if (unlikely(err))
+			goto err2;
+
+		err = check_addr(rxe, pkt, qp);
+		if (unlikely(err))
+			goto err2;
+
+		err = check_keys(rxe, pkt, qpn, qp);
+		if (unlikely(err))
+			goto err2;
+	} else {
+		if (unlikely((pkt->mask & RXE_GRH_MASK) == 0)) {
+			pr_warn_ratelimited("no grh for mcast qpn\n");
+			goto err1;
+		}
+	}
+
+	pkt->qp = qp;
+	return 0;
+
+err2:
+	if (qp)
+		rxe_drop_ref(qp);
+err1:
+	return -EINVAL;
+}
+
+/* hand the packet to the responder (requests) or the completer (others) */
+static inline void rxe_rcv_pkt(struct rxe_dev *rxe,
+			       struct rxe_pkt_info *pkt,
+			       struct sk_buff *skb)
+{
+	if (pkt->mask & RXE_REQ_MASK)
+		rxe_resp_queue_pkt(rxe, pkt->qp, skb);
+	else
+		rxe_comp_queue_pkt(rxe, pkt->qp, skb);
+}
+
+/* Deliver a multicast packet to every QP attached to the destination
+ * group that passes the type/state and key checks. Each receiving QP but
+ * the last gets the current skb while a clone is kept for the next one.
+ * Whatever skb is left over at the end is freed here.
+ */
+static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+	struct rxe_mc_grp *mcg;
+	struct sk_buff *skb_copy;
+	struct rxe_mc_elem *mce;
+	struct rxe_qp *qp;
+	union ib_gid dgid;
+	int err;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+				       (struct in6_addr *)&dgid);
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		memcpy(&dgid, &ipv6_hdr(skb)->daddr, sizeof(dgid));
+	else
+		goto err1;	/* dgid would be used uninitialized */
+
+	/* lookup mcast group corresponding to mgid, takes a ref */
+	mcg = rxe_pool_get_key(&rxe->mc_grp_pool, &dgid);
+	if (!mcg)
+		goto err1;	/* mcast group not registered */
+
+	spin_lock_bh(&mcg->mcg_lock);
+
+	list_for_each_entry(mce, &mcg->qp_list, qp_list) {
+		qp = mce->qp;
+		pkt = SKB_TO_PKT(skb);
+
+		/* validate qp for incoming packet */
+		err = check_type_state(rxe, pkt, qp);
+		if (err)
+			continue;
+
+		err = check_keys(rxe, pkt, bth_qpn(pkt), qp);
+		if (err)
+			continue;
+
+		/* if *not* the last qp in the list
+		 * make a copy of the skb to post to the next qp
+		 * (mcg_lock is held, so the allocation must not sleep)
+		 */
+		skb_copy = (mce->qp_list.next != &mcg->qp_list) ?
+				skb_clone(skb, GFP_ATOMIC) : NULL;
+
+		pkt->qp = qp;
+		rxe_add_ref(qp);
+		rxe_rcv_pkt(rxe, pkt, skb);
+
+		skb = skb_copy;
+		if (!skb)
+			break;
+	}
+
+	spin_unlock_bh(&mcg->mcg_lock);
+
+	rxe_drop_ref(mcg);	/* drop ref from rxe_pool_get_key. */
+
+err1:
+	/* kfree_skb() accepts NULL */
+	kfree_skb(skb);
+}
+
+/* Look the packet's destination address up in the cached GID table of
+ * port 1 for this device; IPv4 destinations are converted to v4-mapped
+ * form first. Returns the result of ib_find_cached_gid_by_port().
+ */
+static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+	union ib_gid dgid;
+	union ib_gid *pdgid;
+	u16 index;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+				       (struct in6_addr *)&dgid);
+		pdgid = &dgid;
+	} else {
+		pdgid = (union ib_gid *)&ipv6_hdr(skb)->daddr;
+	}
+
+	return ib_find_cached_gid_by_port(&rxe->ib_dev, pdgid,
+					  IB_GID_TYPE_ROCE_UDP_ENCAP,
+					  1, rxe->ndev, &index);
+}
+
+/* rxe_rcv is called from the interface driver
+ *
+ * Validates the headers and the ICRC of a received packet and passes it
+ * on to the multicast or unicast receive path. A packet that fails any
+ * check is dropped and its skb freed. Always returns 0.
+ */
+int rxe_rcv(struct sk_buff *skb)
+{
+	int err;
+	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+	struct rxe_dev *rxe = pkt->rxe;
+	__be32 *icrcp;
+	u32 calc_icrc, pack_icrc;
+
+	pkt->offset = 0;
+	/* the drop path tests pkt->qp, so clear it before the first check */
+	pkt->qp = NULL;
+
+	if (unlikely(skb->len < pkt->offset + RXE_BTH_BYTES))
+		goto drop;
+
+	if (unlikely(rxe_match_dgid(rxe, skb) < 0)) {
+		pr_warn_ratelimited("failed matching dgid\n");
+		goto drop;
+	}
+
+	pkt->opcode = bth_opcode(pkt);
+	pkt->psn = bth_psn(pkt);
+	pkt->mask |= rxe_opcode[pkt->opcode].mask;
+
+	if (unlikely(skb->len < header_size(pkt)))
+		goto drop;
+
+	err = hdr_check(pkt);
+	if (unlikely(err))
+		goto drop;
+
+	/* Verify ICRC */
+	icrcp = (__be32 *)(pkt->hdr + pkt->paylen - RXE_ICRC_SIZE);
+	pack_icrc = be32_to_cpu(*icrcp);
+
+	calc_icrc = rxe_icrc_hdr(pkt, skb);
+	calc_icrc = crc32_le(calc_icrc, (u8 *)payload_addr(pkt), payload_size(pkt));
+	calc_icrc = cpu_to_be32(~calc_icrc);
+	if (unlikely(calc_icrc != pack_icrc)) {
+		/* format the address directly: a textual IPv6 address does
+		 * not fit in a buffer of sizeof(struct in6_addr) bytes
+		 */
+		if (skb->protocol == htons(ETH_P_IPV6))
+			pr_warn_ratelimited("bad ICRC from %pI6\n",
+					    &ipv6_hdr(skb)->saddr);
+		else if (skb->protocol == htons(ETH_P_IP))
+			pr_warn_ratelimited("bad ICRC from %pI4\n",
+					    &ip_hdr(skb)->saddr);
+		else
+			pr_warn_ratelimited("bad ICRC from unknown\n");
+
+		goto drop;
+	}
+
+	if (unlikely(bth_qpn(pkt) == IB_MULTICAST_QPN))
+		rxe_rcv_mcast_pkt(rxe, skb);
+	else
+		rxe_rcv_pkt(rxe, pkt, skb);
+
+	return 0;
+
+drop:
+	if (pkt->qp)
+		rxe_drop_ref(pkt->qp);
+
+	kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL(rxe_rcv);
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
new file mode 100644
index 000000000000..33b2d9d77021
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -0,0 +1,726 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+		       unsigned opcode);
+
+/* Replay the bookkeeping for the first npsn packets of a partially sent
+ * write or send wqe: the wire opcode sequence is advanced, the dma state
+ * is moved forward by one MTU (or the remaining bytes) per packet and,
+ * for writes, the target iova is advanced by one MTU per packet.
+ */
+static inline void retry_first_write_send(struct rxe_qp *qp,
+					  struct rxe_send_wqe *wqe,
+					  unsigned mask, int npsn)
+{
+	int i;
+
+	for (i = 0; i < npsn; i++) {
+		int to_send = (wqe->dma.resid > qp->mtu) ?
+				qp->mtu : wqe->dma.resid;
+
+		qp->req.opcode = next_opcode(qp, wqe,
+					     wqe->wr.opcode);
+
+		if (wqe->wr.send_flags & IB_SEND_INLINE) {
+			wqe->dma.resid -= to_send;
+			wqe->dma.sge_offset += to_send;
+		} else {
+			advance_dma_data(&wqe->dma, to_send);
+		}
+		if (mask & WR_WRITE_MASK)
+			wqe->iova += qp->mtu;
+	}
+}
+
+/* Rewind the requester to the completer's position so that every wqe
+ * that is neither done nor still untouched (posted) is sent again. The
+ * first such wqe resumes after the npsn packets that lie before
+ * comp.psn; later ones restart from the beginning.
+ */
+static void req_retry(struct rxe_qp *qp)
+{
+	struct rxe_send_wqe *wqe;
+	unsigned int wqe_index;
+	unsigned int mask;
+	int npsn;
+	int first = 1;
+
+	/* NOTE(review): queue_head() returns NULL for an empty queue;
+	 * presumably a retry is only requested while wqes are
+	 * outstanding - confirm against the callers that set need_retry
+	 */
+	wqe = queue_head(qp->sq.queue);
+	npsn = (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK;
+
+	qp->req.wqe_index = consumer_index(qp->sq.queue);
+	qp->req.psn = qp->comp.psn;
+	qp->req.opcode = -1;
+
+	for (wqe_index = consumer_index(qp->sq.queue);
+	     wqe_index != producer_index(qp->sq.queue);
+	     wqe_index = next_index(qp->sq.queue, wqe_index)) {
+		wqe = addr_from_index(qp->sq.queue, wqe_index);
+		mask = wr_opcode_mask(wqe->wr.opcode, qp);
+
+		if (wqe->state == wqe_state_posted)
+			break;
+
+		if (wqe->state == wqe_state_done)
+			continue;
+
+		wqe->iova = (mask & WR_ATOMIC_MASK) ?
+			     wqe->wr.wr.atomic.remote_addr :
+			     (mask & WR_READ_OR_WRITE_MASK) ?
+			     wqe->wr.wr.rdma.remote_addr :
+			     0;
+
+		if (!first || (mask & WR_READ_MASK) == 0) {
+			wqe->dma.resid = wqe->dma.length;
+			wqe->dma.cur_sge = 0;
+			wqe->dma.sge_offset = 0;
+		}
+
+		if (first) {
+			first = 0;
+
+			if (mask & WR_WRITE_OR_SEND_MASK)
+				retry_first_write_send(qp, wqe, mask, npsn);
+
+			if (mask & WR_READ_MASK)
+				wqe->iova += npsn * qp->mtu;
+		}
+
+		wqe->state = wqe_state_posted;
+	}
+}
+
+/* timer callback: data is the qp; runs the requester task */
+void rnr_nak_timer(unsigned long data)
+{
+	struct rxe_qp *qp = (struct rxe_qp *)data;
+
+	pr_debug("rnr nak timer fired\n");
+	rxe_run_task(&qp->req.task, 1);
+}
+
+/* Return the next wqe the requester should work on, or NULL when there
+ * is nothing to send right now (queue exhausted, QP draining, or a fenced
+ * wqe that has to wait for earlier ones to complete).
+ *
+ * While the QP is in QP_STATE_DRAIN this also detects that the send
+ * queue has drained, moves the QP to QP_STATE_DRAINED and raises
+ * IB_EVENT_SQ_DRAINED.
+ */
+static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
+{
+	struct rxe_send_wqe *wqe = queue_head(qp->sq.queue);
+	unsigned long flags;
+
+	if (unlikely(qp->req.state == QP_STATE_DRAIN)) {
+		/* check to see if we are drained;
+		 * state_lock used by requester and completer
+		 */
+		spin_lock_irqsave(&qp->state_lock, flags);
+		do {
+			if (qp->req.state != QP_STATE_DRAIN) {
+				/* comp just finished */
+				spin_unlock_irqrestore(&qp->state_lock,
+						       flags);
+				break;
+			}
+
+			if (wqe && ((qp->req.wqe_index !=
+				consumer_index(qp->sq.queue)) ||
+				(wqe->state != wqe_state_posted))) {
+				/* comp not done yet */
+				spin_unlock_irqrestore(&qp->state_lock,
+						       flags);
+				break;
+			}
+
+			qp->req.state = QP_STATE_DRAINED;
+			spin_unlock_irqrestore(&qp->state_lock, flags);
+
+			if (qp->ibqp.event_handler) {
+				struct ib_event ev;
+
+				ev.device = qp->ibqp.device;
+				ev.element.qp = &qp->ibqp;
+				ev.event = IB_EVENT_SQ_DRAINED;
+				qp->ibqp.event_handler(&ev,
+					qp->ibqp.qp_context);
+			}
+		} while (0);
+	}
+
+	if (qp->req.wqe_index == producer_index(qp->sq.queue))
+		return NULL;
+
+	wqe = addr_from_index(qp->sq.queue, qp->req.wqe_index);
+
+	if (unlikely((qp->req.state == QP_STATE_DRAIN ||
+		      qp->req.state == QP_STATE_DRAINED) &&
+		     (wqe->state != wqe_state_processing)))
+		return NULL;
+
+	if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
+		     (qp->req.wqe_index != consumer_index(qp->sq.queue)))) {
+		qp->req.wait_fence = 1;
+		return NULL;
+	}
+
+	wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
+	return wqe;
+}
+
+/* Pick the RC wire opcode for the next packet of a work request from the
+ * previously sent opcode (qp->req.opcode) and from whether the remaining
+ * payload fits in one packet. Returns -EINVAL for an unhandled opcode.
+ */
+static int next_opcode_rc(struct rxe_qp *qp, unsigned opcode, int fits)
+{
+	switch (opcode) {
+	case IB_WR_RDMA_WRITE:
+		if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST ||
+		    qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE)
+			return fits ?
+				IB_OPCODE_RC_RDMA_WRITE_LAST :
+				IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+		else
+			return fits ?
+				IB_OPCODE_RC_RDMA_WRITE_ONLY :
+				IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST ||
+		    qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE)
+			return fits ?
+				IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+				IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+		else
+			return fits ?
+				IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+				IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+	case IB_WR_SEND:
+		if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+		    qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE)
+			return fits ?
+				IB_OPCODE_RC_SEND_LAST :
+				IB_OPCODE_RC_SEND_MIDDLE;
+		else
+			return fits ?
+				IB_OPCODE_RC_SEND_ONLY :
+				IB_OPCODE_RC_SEND_FIRST;
+
+	case IB_WR_SEND_WITH_IMM:
+		if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+		    qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE)
+			return fits ?
+				IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE :
+				IB_OPCODE_RC_SEND_MIDDLE;
+		else
+			return fits ?
+				IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE :
+				IB_OPCODE_RC_SEND_FIRST;
+
+	case IB_WR_RDMA_READ:
+		return IB_OPCODE_RC_RDMA_READ_REQUEST;
+
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+		return IB_OPCODE_RC_COMPARE_SWAP;
+
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		return IB_OPCODE_RC_FETCH_ADD;
+
+	case IB_WR_SEND_WITH_INV:
+		if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+		    qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE)
+			return fits ? IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE :
+				IB_OPCODE_RC_SEND_MIDDLE;
+		else
+			return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE :
+				IB_OPCODE_RC_SEND_FIRST;
+	case IB_WR_REG_MR:
+	case IB_WR_LOCAL_INV:
+		return opcode;
+	}
+
+	return -EINVAL;
+}
+
+/* UC counterpart of next_opcode_rc(); only writes and sends are handled,
+ * anything else yields -EINVAL.
+ */
+static int next_opcode_uc(struct rxe_qp *qp, unsigned opcode, int fits)
+{
+	switch (opcode) {
+	case IB_WR_RDMA_WRITE:
+		if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+		    qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE)
+			return fits ?
+				IB_OPCODE_UC_RDMA_WRITE_LAST :
+				IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+		else
+			return fits ?
+				IB_OPCODE_UC_RDMA_WRITE_ONLY :
+				IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+		    qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE)
+			return fits ?
+				IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+				IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+		else
+			return fits ?
+				IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+				IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+	case IB_WR_SEND:
+		if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST ||
+		    qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE)
+			return fits ?
+				IB_OPCODE_UC_SEND_LAST :
+				IB_OPCODE_UC_SEND_MIDDLE;
+		else
+			return fits ?
+				IB_OPCODE_UC_SEND_ONLY :
+				IB_OPCODE_UC_SEND_FIRST;
+
+	case IB_WR_SEND_WITH_IMM:
+		if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST ||
+		    qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE)
+			return fits ?
+				IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE :
+				IB_OPCODE_UC_SEND_MIDDLE;
+		else
+			return fits ?
+				IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE :
+				IB_OPCODE_UC_SEND_FIRST;
+	}
+
+	return -EINVAL;
+}
+
+/* Map a work request opcode to the wire opcode of the next packet for
+ * this QP's type. Returns -EINVAL when the combination is not supported.
+ */
+static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+		       unsigned opcode)
+{
+	int fits = (wqe->dma.resid <= qp->mtu);
+
+	switch (qp_type(qp)) {
+	case IB_QPT_RC:
+		return next_opcode_rc(qp, opcode, fits);
+
+	case IB_QPT_UC:
+		return next_opcode_uc(qp, opcode, fits);
+
+	case IB_QPT_SMI:
+	case IB_QPT_UD:
+	case IB_QPT_GSI:
+		switch (opcode) {
+		case IB_WR_SEND:
+			return IB_OPCODE_UD_SEND_ONLY;
+
+		case IB_WR_SEND_WITH_IMM:
+			return IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+/* Reserve one of the QP's rd_atomic credits for wqe unless it already
+ * holds one. Returns 0 when the wqe holds a credit, -EAGAIN (with
+ * need_rd_atomic left set) when none is available.
+ */
+static inline int check_init_depth(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+	int depth;
+
+	if (wqe->has_rd_atomic)
+		return 0;
+
+	qp->req.need_rd_atomic = 1;
+	depth = atomic_dec_return(&qp->req.rd_atomic);
+
+	if (depth >= 0) {
+		qp->req.need_rd_atomic = 0;
+		wqe->has_rd_atomic = 1;
+		return 0;
+	}
+
+	atomic_inc(&qp->req.rd_atomic);
+	return -EAGAIN;
+}
+
+/* MTU to use for this QP: the QP's own mtu for RC/UC, the port's mtu_cap
+ * for every other type. wqe is currently not consulted.
+ */
+static inline int get_mtu(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+
+	if ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC))
+		return qp->mtu;
+
+	return rxe->port.mtu_cap;
+}
+
+/* Fill in pkt for the next packet of wqe, allocate its skb through the
+ * interface layer and write the BTH plus any extension headers implied by
+ * the opcode's mask. Returns the skb, or NULL when allocation fails.
+ */
+static struct sk_buff *init_req_packet(struct rxe_qp *qp,
+				       struct rxe_send_wqe *wqe,
+				       int opcode, int payload,
+				       struct rxe_pkt_info *pkt)
+{
+	struct rxe_dev		*rxe = to_rdev(qp->ibqp.device);
+	struct rxe_port		*port = &rxe->port;
+	struct sk_buff		*skb;
+	struct rxe_send_wr	*ibwr = &wqe->wr;
+	struct rxe_av		*av;
+	int			pad = (-payload) & 0x3;
+	int			paylen;
+	int			solicited;
+	u16			pkey;
+	u32			qp_num;
+	int			ack_req;
+
+	/* length from start of bth to end of icrc */
+	paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE;
+
+	/* pkt->hdr, rxe, port_num and mask are initialized in ifc
+	 * layer
+	 */
+	pkt->opcode	= opcode;
+	pkt->qp		= qp;
+	pkt->psn	= qp->req.psn;
+	pkt->mask	= rxe_opcode[opcode].mask;
+	pkt->paylen	= paylen;
+	pkt->offset	= 0;
+	pkt->wqe	= wqe;
+
+	/* init skb */
+	av = rxe_get_av(pkt);
+	skb = rxe->ifc_ops->init_packet(rxe, av, paylen, pkt);
+	if (unlikely(!skb))
+		return NULL;
+
+	/* init bth */
+	solicited = (ibwr->send_flags & IB_SEND_SOLICITED) &&
+			(pkt->mask & RXE_END_MASK) &&
+			((pkt->mask & (RXE_SEND_MASK)) ||
+			(pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) ==
+			(RXE_WRITE_MASK | RXE_IMMDT_MASK));
+
+	pkey = (qp_type(qp) == IB_QPT_GSI) ?
+		 port->pkey_tbl[ibwr->wr.ud.pkey_index] :
+		 port->pkey_tbl[qp->attr.pkey_index];
+
+	qp_num = (pkt->mask & RXE_DETH_MASK) ? ibwr->wr.ud.remote_qpn :
+					 qp->attr.dest_qp_num;
+
+	/* request an ack on the last packet of a message and after more
+	 * than RXE_MAX_PKT_PER_ACK packets went out without one
+	 */
+	ack_req = ((pkt->mask & RXE_END_MASK) ||
+		(qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
+	if (ack_req)
+		qp->req.noack_pkts = 0;
+
+	bth_init(pkt, pkt->opcode, solicited, 0, pad, pkey, qp_num,
+		 ack_req, pkt->psn);
+
+	/* init optional headers */
+	if (pkt->mask & RXE_RETH_MASK) {
+		reth_set_rkey(pkt, ibwr->wr.rdma.rkey);
+		reth_set_va(pkt, wqe->iova);
+		reth_set_len(pkt, wqe->dma.length);
+	}
+
+	if (pkt->mask & RXE_IMMDT_MASK)
+		immdt_set_imm(pkt, ibwr->ex.imm_data);
+
+	if (pkt->mask & RXE_IETH_MASK)
+		ieth_set_rkey(pkt, ibwr->ex.invalidate_rkey);
+
+	if (pkt->mask & RXE_ATMETH_MASK) {
+		atmeth_set_va(pkt, wqe->iova);
+		if (opcode == IB_OPCODE_RC_COMPARE_SWAP ||
+		    opcode == IB_OPCODE_RD_COMPARE_SWAP) {
+			atmeth_set_swap_add(pkt, ibwr->wr.atomic.swap);
+			atmeth_set_comp(pkt, ibwr->wr.atomic.compare_add);
+		} else {
+			atmeth_set_swap_add(pkt, ibwr->wr.atomic.compare_add);
+		}
+		atmeth_set_rkey(pkt, ibwr->wr.atomic.rkey);
+	}
+
+	if (pkt->mask & RXE_DETH_MASK) {
+		if (qp->ibqp.qp_num == 1)
+			deth_set_qkey(pkt, GSI_QKEY);
+		else
+			deth_set_qkey(pkt, ibwr->wr.ud.remote_qkey);
+		deth_set_sqp(pkt, qp->ibqp.qp_num);
+	}
+
+	return skb;
+}
+
+/* Let the interface layer prepare the headers, copy paylen bytes of
+ * payload into the packet for writes and sends (from the wqe's inline
+ * data or through copy_data()) and store the inverted CRC after the
+ * padded payload. Returns 0 or the error of the failing step; the skb is
+ * not freed here.
+ */
+static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+		       struct rxe_pkt_info *pkt, struct sk_buff *skb,
+		       int paylen)
+{
+	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+	u32 crc = 0;
+	u32 *p;
+	int err;
+
+	err = rxe->ifc_ops->prepare(rxe, pkt, skb, &crc);
+	if (err)
+		return err;
+
+	if (pkt->mask & RXE_WRITE_OR_SEND) {
+		if (wqe->wr.send_flags & IB_SEND_INLINE) {
+			u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset];
+
+			crc = crc32_le(crc, tmp, paylen);
+
+			memcpy(payload_addr(pkt), tmp, paylen);
+
+			wqe->dma.resid -= paylen;
+			wqe->dma.sge_offset += paylen;
+		} else {
+			err = copy_data(rxe, qp->pd, 0, &wqe->dma,
+					payload_addr(pkt), paylen,
+					from_mem_obj,
+					&crc);
+			if (err)
+				return err;
+		}
+	}
+	p = payload_addr(pkt) + paylen + bth_pad(pkt);
+
+	*p = ~crc;
+
+	return 0;
+}
+
+/* Advance the wqe state for a packet about to be sent and hand the
+ * previous state back through prev_state so the caller can restore it
+ * when the transmit fails.
+ */
+static void update_wqe_state(struct rxe_qp *qp,
+			     struct rxe_send_wqe *wqe,
+			     struct rxe_pkt_info *pkt,
+			     enum wqe_state *prev_state)
+{
+	enum wqe_state prev_state_ = wqe->state;
+
+	if (pkt->mask & RXE_END_MASK) {
+		if (qp_type(qp) == IB_QPT_RC)
+			wqe->state = wqe_state_pending;
+	} else {
+		wqe->state = wqe_state_processing;
+	}
+
+	*prev_state = prev_state_;
+}
+
+/* Account for a packet that was handed to the transmit path: record the
+ * wqe's psn range on its first packet, advance req.psn, remember the
+ * opcode, move on to the next wqe after the last packet and arm the
+ * retransmit timer when it is configured and not already pending.
+ */
+static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+			 struct rxe_pkt_info *pkt, int payload)
+{
+	/* number of packets left to send including current one */
+	int num_pkt = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu;
+
+	/* handle zero length packet case */
+	if (num_pkt == 0)
+		num_pkt = 1;
+
+	if (pkt->mask & RXE_START_MASK) {
+		wqe->first_psn = qp->req.psn;
+		wqe->last_psn = (qp->req.psn + num_pkt - 1) & BTH_PSN_MASK;
+	}
+
+	if (pkt->mask & RXE_READ_MASK)
+		qp->req.psn = (wqe->first_psn + num_pkt) & BTH_PSN_MASK;
+	else
+		qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
+
+	qp->req.opcode = pkt->opcode;
+
+	if (pkt->mask & RXE_END_MASK)
+		qp->req.wqe_index = next_index(qp->sq.queue, qp->req.wqe_index);
+
+	qp->need_req_skb = 0;
+
+	if (qp->qp_timeout_jiffies && !timer_pending(&qp->retrans_timer))
+		mod_timer(&qp->retrans_timer,
+			  jiffies + qp->qp_timeout_jiffies);
+}
+
+/* Requester task body: arg is the qp.
+ *
+ * Repeatedly takes the next send wqe, handles local operations
+ * (IB_WR_LOCAL_INV, IB_WR_REG_MR) in place and otherwise builds and
+ * transmits one packet per iteration until there is nothing more to do
+ * or a resource limit is hit.
+ *
+ * Returns -EAGAIN when it stopped because nothing more can be done now,
+ * 0 after a wqe was completed locally or failed. The skb of a packet
+ * that could not be filled or transmitted is freed exactly once here.
+ */
+int rxe_requester(void *arg)
+{
+	struct rxe_qp *qp = (struct rxe_qp *)arg;
+	struct rxe_pkt_info pkt;
+	struct sk_buff *skb;
+	struct rxe_send_wqe *wqe;
+	unsigned mask;
+	int payload;
+	int mtu;
+	int opcode;
+	int ret;
+	enum wqe_state prev_state;
+
+next_wqe:
+	if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR))
+		goto exit;
+
+	if (unlikely(qp->req.state == QP_STATE_RESET)) {
+		qp->req.wqe_index = consumer_index(qp->sq.queue);
+		qp->req.opcode = -1;
+		qp->req.need_rd_atomic = 0;
+		qp->req.wait_psn = 0;
+		qp->req.need_retry = 0;
+		goto exit;
+	}
+
+	if (unlikely(qp->req.need_retry)) {
+		req_retry(qp);
+		qp->req.need_retry = 0;
+	}
+
+	wqe = req_next_wqe(qp);
+	if (unlikely(!wqe))
+		goto exit;
+
+	if (wqe->mask & WR_REG_MASK) {
+		if (wqe->wr.opcode == IB_WR_LOCAL_INV) {
+			struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+			struct rxe_mem *rmr;
+
+			rmr = rxe_pool_get_index(&rxe->mr_pool,
+						 wqe->wr.ex.invalidate_rkey >> 8);
+			if (!rmr) {
+				pr_err("No mr for key %#x\n", wqe->wr.ex.invalidate_rkey);
+				wqe->state = wqe_state_error;
+				wqe->status = IB_WC_MW_BIND_ERR;
+				goto exit;
+			}
+			rmr->state = RXE_MEM_STATE_FREE;
+			/* release the reference taken by the lookup */
+			rxe_drop_ref(rmr);
+			wqe->state = wqe_state_done;
+			wqe->status = IB_WC_SUCCESS;
+		} else if (wqe->wr.opcode == IB_WR_REG_MR) {
+			struct rxe_mem *rmr = to_rmr(wqe->wr.wr.reg.mr);
+
+			rmr->state = RXE_MEM_STATE_VALID;
+			rmr->access = wqe->wr.wr.reg.access;
+			rmr->lkey = wqe->wr.wr.reg.key;
+			rmr->rkey = wqe->wr.wr.reg.key;
+			wqe->state = wqe_state_done;
+			wqe->status = IB_WC_SUCCESS;
+		} else {
+			goto exit;
+		}
+		qp->req.wqe_index = next_index(qp->sq.queue,
+						qp->req.wqe_index);
+		goto next_wqe;
+	}
+
+	if (unlikely(qp_type(qp) == IB_QPT_RC &&
+		     qp->req.psn > (qp->comp.psn + RXE_MAX_UNACKED_PSNS))) {
+		qp->req.wait_psn = 1;
+		goto exit;
+	}
+
+	/* Limit the number of inflight SKBs per QP */
+	if (unlikely(atomic_read(&qp->skb_out) >
+		     RXE_INFLIGHT_SKBS_PER_QP_HIGH)) {
+		qp->need_req_skb = 1;
+		goto exit;
+	}
+
+	opcode = next_opcode(qp, wqe, wqe->wr.opcode);
+	if (unlikely(opcode < 0)) {
+		wqe->status = IB_WC_LOC_QP_OP_ERR;
+		goto exit;
+	}
+
+	mask = rxe_opcode[opcode].mask;
+	if (unlikely(mask & RXE_READ_OR_ATOMIC)) {
+		if (check_init_depth(qp, wqe))
+			goto exit;
+	}
+
+	mtu = get_mtu(qp, wqe);
+	payload = (mask & RXE_WRITE_OR_SEND) ? wqe->dma.resid : 0;
+	if (payload > mtu) {
+		if (qp_type(qp) == IB_QPT_UD) {
+			/* C10-93.1.1: If the total sum of all the buffer lengths specified for a
+			 * UD message exceeds the MTU of the port as returned by QueryHCA, the CI
+			 * shall not emit any packets for this message. Further, the CI shall not
+			 * generate an error due to this condition.
+			 */
+
+			/* fake a successful UD send */
+			wqe->first_psn = qp->req.psn;
+			wqe->last_psn = qp->req.psn;
+			qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
+			qp->req.opcode = IB_OPCODE_UD_SEND_ONLY;
+			qp->req.wqe_index = next_index(qp->sq.queue,
+						       qp->req.wqe_index);
+			wqe->state = wqe_state_done;
+			wqe->status = IB_WC_SUCCESS;
+			goto complete;
+		}
+		payload = mtu;
+	}
+
+	skb = init_req_packet(qp, wqe, opcode, payload, &pkt);
+	if (unlikely(!skb)) {
+		pr_err("Failed allocating skb\n");
+		goto err;
+	}
+
+	if (fill_packet(qp, wqe, &pkt, skb, payload)) {
+		pr_debug("Error during fill packet\n");
+		kfree_skb(skb);
+		goto err;
+	}
+
+	update_wqe_state(qp, wqe, &pkt, &prev_state);
+	ret = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, &pkt, skb);
+	if (ret) {
+		qp->need_req_skb = 1;
+		kfree_skb(skb);
+
+		wqe->state = prev_state;
+
+		if (ret == -EAGAIN) {
+			rxe_run_task(&qp->req.task, 1);
+			goto exit;
+		}
+
+		goto err;
+	}
+
+	update_state(qp, wqe, &pkt, payload);
+
+	goto next_wqe;
+
+err:
+	/* every path that reaches here has already released its skb */
+	wqe->status = IB_WC_LOC_PROT_ERR;
+	wqe->state = wqe_state_error;
+
+complete:
+	if (qp_type(qp) != IB_QPT_RC) {
+		while (rxe_completer(qp) == 0)
+			;
+	}
+
+	return 0;
+
+exit:
+	return -EAGAIN;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
new file mode 100644
index 000000000000..ebb03b46e2ad
--- /dev/null
+++
b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -0,0 +1,1380 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/skbuff.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +enum resp_states { + RESPST_NONE, + RESPST_GET_REQ, + RESPST_CHK_PSN, + RESPST_CHK_OP_SEQ, + RESPST_CHK_OP_VALID, + RESPST_CHK_RESOURCE, + RESPST_CHK_LENGTH, + RESPST_CHK_RKEY, + RESPST_EXECUTE, + RESPST_READ_REPLY, + RESPST_COMPLETE, + RESPST_ACKNOWLEDGE, + RESPST_CLEANUP, + RESPST_DUPLICATE_REQUEST, + RESPST_ERR_MALFORMED_WQE, + RESPST_ERR_UNSUPPORTED_OPCODE, + RESPST_ERR_MISALIGNED_ATOMIC, + RESPST_ERR_PSN_OUT_OF_SEQ, + RESPST_ERR_MISSING_OPCODE_FIRST, + RESPST_ERR_MISSING_OPCODE_LAST_C, + RESPST_ERR_MISSING_OPCODE_LAST_D1E, + RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, + RESPST_ERR_RNR, + RESPST_ERR_RKEY_VIOLATION, + RESPST_ERR_LENGTH, + RESPST_ERR_CQ_OVERFLOW, + RESPST_ERROR, + RESPST_RESET, + RESPST_DONE, + RESPST_EXIT, +}; + +static char *resp_state_name[] = { + [RESPST_NONE] = "NONE", + [RESPST_GET_REQ] = "GET_REQ", + [RESPST_CHK_PSN] = "CHK_PSN", + [RESPST_CHK_OP_SEQ] = "CHK_OP_SEQ", + [RESPST_CHK_OP_VALID] = "CHK_OP_VALID", + [RESPST_CHK_RESOURCE] = "CHK_RESOURCE", + [RESPST_CHK_LENGTH] = "CHK_LENGTH", + [RESPST_CHK_RKEY] = "CHK_RKEY", + [RESPST_EXECUTE] = "EXECUTE", + [RESPST_READ_REPLY] = "READ_REPLY", + [RESPST_COMPLETE] = "COMPLETE", + [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", + [RESPST_CLEANUP] = "CLEANUP", + [RESPST_DUPLICATE_REQUEST] = "DUPLICATE_REQUEST", + [RESPST_ERR_MALFORMED_WQE] = "ERR_MALFORMED_WQE", + [RESPST_ERR_UNSUPPORTED_OPCODE] = "ERR_UNSUPPORTED_OPCODE", + [RESPST_ERR_MISALIGNED_ATOMIC] = "ERR_MISALIGNED_ATOMIC", + [RESPST_ERR_PSN_OUT_OF_SEQ] = "ERR_PSN_OUT_OF_SEQ", + [RESPST_ERR_MISSING_OPCODE_FIRST] = "ERR_MISSING_OPCODE_FIRST", + [RESPST_ERR_MISSING_OPCODE_LAST_C] = "ERR_MISSING_OPCODE_LAST_C", + [RESPST_ERR_MISSING_OPCODE_LAST_D1E] = "ERR_MISSING_OPCODE_LAST_D1E", + [RESPST_ERR_TOO_MANY_RDMA_ATM_REQ] = "ERR_TOO_MANY_RDMA_ATM_REQ", + [RESPST_ERR_RNR] = "ERR_RNR", + [RESPST_ERR_RKEY_VIOLATION] = "ERR_RKEY_VIOLATION", + [RESPST_ERR_LENGTH] = 
"ERR_LENGTH", + [RESPST_ERR_CQ_OVERFLOW] = "ERR_CQ_OVERFLOW", + [RESPST_ERROR] = "ERROR", + [RESPST_RESET] = "RESET", + [RESPST_DONE] = "DONE", + [RESPST_EXIT] = "EXIT", +}; + +/* rxe_recv calls here to add a request packet to the input queue */ +void rxe_resp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp, + struct sk_buff *skb) +{ + int must_sched; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + skb_queue_tail(&qp->req_pkts, skb); + + must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) || + (skb_queue_len(&qp->req_pkts) > 1); + + rxe_run_task(&qp->resp.task, must_sched); +} + +static inline enum resp_states get_req(struct rxe_qp *qp, + struct rxe_pkt_info **pkt_p) +{ + struct sk_buff *skb; + + if (qp->resp.state == QP_STATE_ERROR) { + skb = skb_dequeue(&qp->req_pkts); + if (skb) { + /* drain request packet queue */ + rxe_drop_ref(qp); + kfree_skb(skb); + return RESPST_GET_REQ; + } + + /* go drain recv wr queue */ + return RESPST_CHK_RESOURCE; + } + + skb = skb_peek(&qp->req_pkts); + if (!skb) + return RESPST_EXIT; + + *pkt_p = SKB_TO_PKT(skb); + + return (qp->resp.res) ? 
RESPST_READ_REPLY : RESPST_CHK_PSN; +} + +static enum resp_states check_psn(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + int diff = psn_compare(pkt->psn, qp->resp.psn); + + switch (qp_type(qp)) { + case IB_QPT_RC: + if (diff > 0) { + if (qp->resp.sent_psn_nak) + return RESPST_CLEANUP; + + qp->resp.sent_psn_nak = 1; + return RESPST_ERR_PSN_OUT_OF_SEQ; + + } else if (diff < 0) { + return RESPST_DUPLICATE_REQUEST; + } + + if (qp->resp.sent_psn_nak) + qp->resp.sent_psn_nak = 0; + + break; + + case IB_QPT_UC: + if (qp->resp.drop_msg || diff != 0) { + if (pkt->mask & RXE_START_MASK) { + qp->resp.drop_msg = 0; + return RESPST_CHK_OP_SEQ; + } + + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + } + break; + default: + break; + } + + return RESPST_CHK_OP_SEQ; +} + +static enum resp_states check_op_seq(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + switch (qp->resp.opcode) { + case IB_OPCODE_RC_SEND_FIRST: + case IB_OPCODE_RC_SEND_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_RC_SEND_MIDDLE: + case IB_OPCODE_RC_SEND_LAST: + case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_C; + } + + case IB_OPCODE_RC_RDMA_WRITE_FIRST: + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_RC_RDMA_WRITE_LAST: + case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_C; + } + + default: + switch (pkt->opcode) { + case IB_OPCODE_RC_SEND_MIDDLE: + case IB_OPCODE_RC_SEND_LAST: + case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_RC_RDMA_WRITE_LAST: + case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_ERR_MISSING_OPCODE_FIRST; + default: + return RESPST_CHK_OP_VALID; + } 
+ } + break; + + case IB_QPT_UC: + switch (qp->resp.opcode) { + case IB_OPCODE_UC_SEND_FIRST: + case IB_OPCODE_UC_SEND_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_UC_SEND_MIDDLE: + case IB_OPCODE_UC_SEND_LAST: + case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_D1E; + } + + case IB_OPCODE_UC_RDMA_WRITE_FIRST: + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_UC_RDMA_WRITE_LAST: + case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_D1E; + } + + default: + switch (pkt->opcode) { + case IB_OPCODE_UC_SEND_MIDDLE: + case IB_OPCODE_UC_SEND_LAST: + case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_UC_RDMA_WRITE_LAST: + case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + default: + return RESPST_CHK_OP_VALID; + } + } + break; + + default: + return RESPST_CHK_OP_VALID; + } +} + +static enum resp_states check_op_valid(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + if (((pkt->mask & RXE_READ_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || + ((pkt->mask & RXE_WRITE_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) || + ((pkt->mask & RXE_ATOMIC_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) { + return RESPST_ERR_UNSUPPORTED_OPCODE; + } + + break; + + case IB_QPT_UC: + if ((pkt->mask & RXE_WRITE_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) { + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + } + + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + break; + + default: + WARN_ON(1); + break; + } + + return RESPST_CHK_RESOURCE; +} + +static enum resp_states get_srq_wqe(struct rxe_qp *qp) +{ + struct rxe_srq *srq = 
qp->srq; + struct rxe_queue *q = srq->rq.queue; + struct rxe_recv_wqe *wqe; + struct ib_event ev; + + if (srq->error) + return RESPST_ERR_RNR; + + spin_lock_bh(&srq->rq.consumer_lock); + + wqe = queue_head(q); + if (!wqe) { + spin_unlock_bh(&srq->rq.consumer_lock); + return RESPST_ERR_RNR; + } + + /* note kernel and user space recv wqes have same size */ + memcpy(&qp->resp.srq_wqe, wqe, sizeof(qp->resp.srq_wqe)); + + qp->resp.wqe = &qp->resp.srq_wqe.wqe; + advance_consumer(q); + + if (srq->limit && srq->ibsrq.event_handler && + (queue_count(q) < srq->limit)) { + srq->limit = 0; + goto event; + } + + spin_unlock_bh(&srq->rq.consumer_lock); + return RESPST_CHK_LENGTH; + +event: + spin_unlock_bh(&srq->rq.consumer_lock); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context); + return RESPST_CHK_LENGTH; +} + +static enum resp_states check_resource(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_srq *srq = qp->srq; + + if (qp->resp.state == QP_STATE_ERROR) { + if (qp->resp.wqe) { + qp->resp.status = IB_WC_WR_FLUSH_ERR; + return RESPST_COMPLETE; + } else if (!srq) { + qp->resp.wqe = queue_head(qp->rq.queue); + if (qp->resp.wqe) { + qp->resp.status = IB_WC_WR_FLUSH_ERR; + return RESPST_COMPLETE; + } else { + return RESPST_EXIT; + } + } else { + return RESPST_EXIT; + } + } + + if (pkt->mask & RXE_READ_OR_ATOMIC) { + /* it is the requesters job to not send + * too many read/atomic ops, we just + * recycle the responder resource queue + */ + if (likely(qp->attr.max_rd_atomic > 0)) + return RESPST_CHK_LENGTH; + else + return RESPST_ERR_TOO_MANY_RDMA_ATM_REQ; + } + + if (pkt->mask & RXE_RWR_MASK) { + if (srq) + return get_srq_wqe(qp); + + qp->resp.wqe = queue_head(qp->rq.queue); + return (qp->resp.wqe) ? 
RESPST_CHK_LENGTH : RESPST_ERR_RNR; + } + + return RESPST_CHK_LENGTH; +} + +static enum resp_states check_length(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + return RESPST_CHK_RKEY; + + case IB_QPT_UC: + return RESPST_CHK_RKEY; + + default: + return RESPST_CHK_RKEY; + } +} + +static enum resp_states check_rkey(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_mem *mem; + u64 va; + u32 rkey; + u32 resid; + u32 pktlen; + int mtu = qp->mtu; + enum resp_states state; + int access; + + if (pkt->mask & (RXE_READ_MASK | RXE_WRITE_MASK)) { + if (pkt->mask & RXE_RETH_MASK) { + qp->resp.va = reth_va(pkt); + qp->resp.rkey = reth_rkey(pkt); + qp->resp.resid = reth_len(pkt); + } + access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ + : IB_ACCESS_REMOTE_WRITE; + } else if (pkt->mask & RXE_ATOMIC_MASK) { + qp->resp.va = atmeth_va(pkt); + qp->resp.rkey = atmeth_rkey(pkt); + qp->resp.resid = sizeof(u64); + access = IB_ACCESS_REMOTE_ATOMIC; + } else { + return RESPST_EXECUTE; + } + + va = qp->resp.va; + rkey = qp->resp.rkey; + resid = qp->resp.resid; + pktlen = payload_size(pkt); + + mem = lookup_mem(qp->pd, access, rkey, lookup_remote); + if (!mem) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err1; + } + + if (unlikely(mem->state == RXE_MEM_STATE_FREE)) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err1; + } + + if (mem_check_range(mem, va, resid)) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err2; + } + + if (pkt->mask & RXE_WRITE_MASK) { + if (resid > mtu) { + if (pktlen != mtu || bth_pad(pkt)) { + state = RESPST_ERR_LENGTH; + goto err2; + } + + resid = mtu; + } else { + if (pktlen != resid) { + state = RESPST_ERR_LENGTH; + goto err2; + } + if ((bth_pad(pkt) != (0x3 & (-resid)))) { + /* This case may not be exactly that + * but nothing else fits. 
+ */ + state = RESPST_ERR_LENGTH; + goto err2; + } + } + } + + WARN_ON(qp->resp.mr); + + qp->resp.mr = mem; + return RESPST_EXECUTE; + +err2: + rxe_drop_ref(mem); +err1: + return state; +} + +static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr, + int data_len) +{ + int err; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + err = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma, + data_addr, data_len, to_mem_obj, NULL); + if (unlikely(err)) + return (err == -ENOSPC) ? RESPST_ERR_LENGTH + : RESPST_ERR_MALFORMED_WQE; + + return RESPST_NONE; +} + +static enum resp_states write_data_in(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + enum resp_states rc = RESPST_NONE; + int err; + int data_len = payload_size(pkt); + + err = rxe_mem_copy(qp->resp.mr, qp->resp.va, payload_addr(pkt), + data_len, to_mem_obj, NULL); + if (err) { + rc = RESPST_ERR_RKEY_VIOLATION; + goto out; + } + + qp->resp.va += data_len; + qp->resp.resid -= data_len; + +out: + return rc; +} + +/* Guarantee atomicity of atomic operations at the machine level. */ +static DEFINE_SPINLOCK(atomic_ops_lock); + +static enum resp_states process_atomic(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + u64 iova = atmeth_va(pkt); + u64 *vaddr; + enum resp_states ret; + struct rxe_mem *mr = qp->resp.mr; + + if (mr->state != RXE_MEM_STATE_VALID) { + ret = RESPST_ERR_RKEY_VIOLATION; + goto out; + } + + vaddr = iova_to_vaddr(mr, iova, sizeof(u64)); + + /* check vaddr is 8 bytes aligned. 
*/ + if (!vaddr || (uintptr_t)vaddr & 7) { + ret = RESPST_ERR_MISALIGNED_ATOMIC; + goto out; + } + + spin_lock_bh(&atomic_ops_lock); + + qp->resp.atomic_orig = *vaddr; + + if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP || + pkt->opcode == IB_OPCODE_RD_COMPARE_SWAP) { + if (*vaddr == atmeth_comp(pkt)) + *vaddr = atmeth_swap_add(pkt); + } else { + *vaddr += atmeth_swap_add(pkt); + } + + spin_unlock_bh(&atomic_ops_lock); + + ret = RESPST_NONE; +out: + return ret; +} + +static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_pkt_info *ack, + int opcode, + int payload, + u32 psn, + u8 syndrome, + u32 *crcp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct sk_buff *skb; + u32 crc = 0; + u32 *p; + int paylen; + int pad; + int err; + + /* + * allocate packet + */ + pad = (-payload) & 0x3; + paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; + + skb = rxe->ifc_ops->init_packet(rxe, &qp->pri_av, paylen, ack); + if (!skb) + return NULL; + + ack->qp = qp; + ack->opcode = opcode; + ack->mask = rxe_opcode[opcode].mask; + ack->offset = pkt->offset; + ack->paylen = paylen; + + /* fill in bth using the request packet headers */ + memcpy(ack->hdr, pkt->hdr, pkt->offset + RXE_BTH_BYTES); + + bth_set_opcode(ack, opcode); + bth_set_qpn(ack, qp->attr.dest_qp_num); + bth_set_pad(ack, pad); + bth_set_se(ack, 0); + bth_set_psn(ack, psn); + bth_set_ack(ack, 0); + ack->psn = psn; + + if (ack->mask & RXE_AETH_MASK) { + aeth_set_syn(ack, syndrome); + aeth_set_msn(ack, qp->resp.msn); + } + + if (ack->mask & RXE_ATMACK_MASK) + atmack_set_orig(ack, qp->resp.atomic_orig); + + err = rxe->ifc_ops->prepare(rxe, ack, skb, &crc); + if (err) { + kfree_skb(skb); + return NULL; + } + + if (crcp) { + /* CRC computation will be continued by the caller */ + *crcp = crc; + } else { + p = payload_addr(ack) + payload + bth_pad(ack); + *p = ~crc; + } + + return skb; +} + +/* RDMA read response. 
If res is not NULL, then we have a current RDMA request + * being processed or replayed. + */ +static enum resp_states read_reply(struct rxe_qp *qp, + struct rxe_pkt_info *req_pkt) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + int mtu = qp->mtu; + enum resp_states state; + int payload; + int opcode; + int err; + struct resp_res *res = qp->resp.res; + u32 icrc; + u32 *p; + + if (!res) { + /* This is the first time we process that request. Get a + * resource + */ + res = &qp->resp.resources[qp->resp.res_head]; + + free_rd_atomic_resource(qp, res); + rxe_advance_resp_resource(qp); + + res->type = RXE_READ_MASK; + + res->read.va = qp->resp.va; + res->read.va_org = qp->resp.va; + + res->first_psn = req_pkt->psn; + res->last_psn = req_pkt->psn + + (reth_len(req_pkt) + mtu - 1) / + mtu - 1; + res->cur_psn = req_pkt->psn; + + res->read.resid = qp->resp.resid; + res->read.length = qp->resp.resid; + res->read.rkey = qp->resp.rkey; + + /* note res inherits the reference to mr from qp */ + res->read.mr = qp->resp.mr; + qp->resp.mr = NULL; + + qp->resp.res = res; + res->state = rdatm_res_state_new; + } + + if (res->state == rdatm_res_state_new) { + if (res->read.resid <= mtu) + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY; + else + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; + } else { + if (res->read.resid > mtu) + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE; + else + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST; + } + + res->state = rdatm_res_state_next; + + payload = min_t(int, res->read.resid, mtu); + + skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload, + res->cur_psn, AETH_ACK_UNLIMITED, &icrc); + if (!skb) + return RESPST_ERR_RNR; + + err = rxe_mem_copy(res->read.mr, res->read.va, payload_addr(&ack_pkt), + payload, from_mem_obj, &icrc); + if (err) + pr_err("Failed copying memory\n"); + + p = payload_addr(&ack_pkt) + payload + bth_pad(&ack_pkt); + *p = ~icrc; + + err = 
rxe_xmit_packet(rxe, qp, &ack_pkt, skb); + if (err) { + pr_err("Failed sending RDMA reply.\n"); + kfree_skb(skb); + return RESPST_ERR_RNR; + } + + res->read.va += payload; + res->read.resid -= payload; + res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK; + + if (res->read.resid > 0) { + state = RESPST_DONE; + } else { + qp->resp.res = NULL; + qp->resp.opcode = -1; + qp->resp.psn = res->cur_psn; + state = RESPST_CLEANUP; + } + + return state; +} + +/* Executes a new request. A retried request never reach that function (send + * and writes are discarded, and reads and atomics are retried elsewhere. + */ +static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + enum resp_states err; + + if (pkt->mask & RXE_SEND_MASK) { + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) { + union rdma_network_hdr hdr; + struct sk_buff *skb = PKT_TO_SKB(pkt); + + memset(&hdr, 0, sizeof(hdr)); + if (skb->protocol == htons(ETH_P_IP)) + memcpy(&hdr.roce4grh, ip_hdr(skb), sizeof(hdr.roce4grh)); + else if (skb->protocol == htons(ETH_P_IPV6)) + memcpy(&hdr.ibgrh, ipv6_hdr(skb), sizeof(hdr.ibgrh)); + + err = send_data_in(qp, &hdr, sizeof(hdr)); + if (err) + return err; + } + err = send_data_in(qp, payload_addr(pkt), payload_size(pkt)); + if (err) + return err; + } else if (pkt->mask & RXE_WRITE_MASK) { + err = write_data_in(qp, pkt); + if (err) + return err; + } else if (pkt->mask & RXE_READ_MASK) { + /* For RDMA Read we can increment the msn now. See C9-148. */ + qp->resp.msn++; + return RESPST_READ_REPLY; + } else if (pkt->mask & RXE_ATOMIC_MASK) { + err = process_atomic(qp, pkt); + if (err) + return err; + } else + /* Unreachable */ + WARN_ON(1); + + /* We successfully processed this new request. 
*/ + qp->resp.msn++; + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + + if (pkt->mask & RXE_COMP_MASK) + return RESPST_COMPLETE; + else if (qp_type(qp) == IB_QPT_RC) + return RESPST_ACKNOWLEDGE; + else + return RESPST_CLEANUP; +} + +static enum resp_states do_complete(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_cqe cqe; + struct ib_wc *wc = &cqe.ibwc; + struct ib_uverbs_wc *uwc = &cqe.uibwc; + struct rxe_recv_wqe *wqe = qp->resp.wqe; + + if (unlikely(!wqe)) + return RESPST_CLEANUP; + + memset(&cqe, 0, sizeof(cqe)); + + wc->wr_id = wqe->wr_id; + wc->status = qp->resp.status; + wc->qp = &qp->ibqp; + + /* fields after status are not required for errors */ + if (wc->status == IB_WC_SUCCESS) { + wc->opcode = (pkt->mask & RXE_IMMDT_MASK && + pkt->mask & RXE_WRITE_MASK) ? + IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV; + wc->vendor_err = 0; + wc->byte_len = wqe->dma.length - wqe->dma.resid; + + /* fields after byte_len are different between kernel and user + * space + */ + if (qp->rcq->is_user) { + uwc->wc_flags = IB_WC_GRH; + + if (pkt->mask & RXE_IMMDT_MASK) { + uwc->wc_flags |= IB_WC_WITH_IMM; + uwc->ex.imm_data = + (__u32 __force)immdt_imm(pkt); + } + + if (pkt->mask & RXE_IETH_MASK) { + uwc->wc_flags |= IB_WC_WITH_INVALIDATE; + uwc->ex.invalidate_rkey = ieth_rkey(pkt); + } + + uwc->qp_num = qp->ibqp.qp_num; + + if (pkt->mask & RXE_DETH_MASK) + uwc->src_qp = deth_sqp(pkt); + + uwc->port_num = qp->attr.port_num; + } else { + struct sk_buff *skb = PKT_TO_SKB(pkt); + + wc->wc_flags = IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE; + if (skb->protocol == htons(ETH_P_IP)) + wc->network_hdr_type = RDMA_NETWORK_IPV4; + else + wc->network_hdr_type = RDMA_NETWORK_IPV6; + + if (pkt->mask & RXE_IMMDT_MASK) { + wc->wc_flags |= IB_WC_WITH_IMM; + wc->ex.imm_data = immdt_imm(pkt); + } + + if (pkt->mask & RXE_IETH_MASK) { + struct rxe_dev *rxe = 
to_rdev(qp->ibqp.device); + struct rxe_mem *rmr; + + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = ieth_rkey(pkt); + + rmr = rxe_pool_get_index(&rxe->mr_pool, + wc->ex.invalidate_rkey >> 8); + if (unlikely(!rmr)) { + pr_err("Bad rkey %#x invalidation\n", wc->ex.invalidate_rkey); + return RESPST_ERROR; + } + rmr->state = RXE_MEM_STATE_FREE; + } + + wc->qp = &qp->ibqp; + + if (pkt->mask & RXE_DETH_MASK) + wc->src_qp = deth_sqp(pkt); + + wc->port_num = qp->attr.port_num; + } + } + + /* have copy for srq and reference for !srq */ + if (!qp->srq) + advance_consumer(qp->rq.queue); + + qp->resp.wqe = NULL; + + if (rxe_cq_post(qp->rcq, &cqe, pkt ? bth_se(pkt) : 1)) + return RESPST_ERR_CQ_OVERFLOW; + + if (qp->resp.state == QP_STATE_ERROR) + return RESPST_CHK_RESOURCE; + + if (!pkt) + return RESPST_DONE; + else if (qp_type(qp) == IB_QPT_RC) + return RESPST_ACKNOWLEDGE; + else + return RESPST_CLEANUP; +} + +static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, + u8 syndrome, u32 psn) +{ + int err = 0; + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE, + 0, psn, syndrome, NULL); + if (!skb) { + err = -ENOMEM; + goto err1; + } + + err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb); + if (err) { + pr_err_ratelimited("Failed sending ack\n"); + kfree_skb(skb); + } + +err1: + return err; +} + +static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, + u8 syndrome) +{ + int rc = 0; + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + struct sk_buff *skb_copy; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct resp_res *res; + + skb = prepare_ack_packet(qp, pkt, &ack_pkt, + IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn, + syndrome, NULL); + if (!skb) { + rc = -ENOMEM; + goto out; + } + + skb_copy = skb_clone(skb, GFP_ATOMIC); + if (skb_copy) + rxe_add_ref(qp); /* for the new SKB */ + else { + pr_warn("Could 
not clone atomic response\n"); + rc = -ENOMEM; + goto out; + } + + res = &qp->resp.resources[qp->resp.res_head]; + free_rd_atomic_resource(qp, res); + rxe_advance_resp_resource(qp); + + res->type = RXE_ATOMIC_MASK; + res->atomic.skb = skb; + res->first_psn = qp->resp.psn; + res->last_psn = qp->resp.psn; + res->cur_psn = qp->resp.psn; + + rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb_copy); + if (rc) { + pr_err_ratelimited("Failed sending ack\n"); + rxe_drop_ref(qp); + kfree_skb(skb_copy); + } + +out: + return rc; +} + +static enum resp_states acknowledge(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + if (qp_type(qp) != IB_QPT_RC) + return RESPST_CLEANUP; + + if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED) + send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn); + else if (pkt->mask & RXE_ATOMIC_MASK) + send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED); + else if (bth_ack(pkt)) + send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn); + + return RESPST_CLEANUP; +} + +static enum resp_states cleanup(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct sk_buff *skb; + + if (pkt) { + skb = skb_dequeue(&qp->req_pkts); + rxe_drop_ref(qp); + kfree_skb(skb); + } + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + return RESPST_DONE; +} + +static struct resp_res *find_resource(struct rxe_qp *qp, u32 psn) +{ + int i; + + for (i = 0; i < qp->attr.max_rd_atomic; i++) { + struct resp_res *res = &qp->resp.resources[i]; + + if (res->type == 0) + continue; + + if (psn_compare(psn, res->first_psn) >= 0 && + psn_compare(psn, res->last_psn) <= 0) { + return res; + } + } + + return NULL; +} + +static enum resp_states duplicate_request(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + enum resp_states rc; + + if (pkt->mask & RXE_SEND_MASK || + pkt->mask & RXE_WRITE_MASK) { + /* SEND. Ack again and cleanup. C9-105. 
*/ + if (bth_ack(pkt)) + send_ack(qp, pkt, AETH_ACK_UNLIMITED, qp->resp.psn - 1); + rc = RESPST_CLEANUP; + goto out; + } else if (pkt->mask & RXE_READ_MASK) { + struct resp_res *res; + + res = find_resource(qp, pkt->psn); + if (!res) { + /* Resource not found. Class D error. Drop the + * request. + */ + rc = RESPST_CLEANUP; + goto out; + } else { + /* Ensure this new request is the same as the previous + * one or a subset of it. + */ + u64 iova = reth_va(pkt); + u32 resid = reth_len(pkt); + + if (iova < res->read.va_org || + resid > res->read.length || + (iova + resid) > (res->read.va_org + + res->read.length)) { + rc = RESPST_CLEANUP; + goto out; + } + + if (reth_rkey(pkt) != res->read.rkey) { + rc = RESPST_CLEANUP; + goto out; + } + + res->cur_psn = pkt->psn; + res->state = (pkt->psn == res->first_psn) ? + rdatm_res_state_new : + rdatm_res_state_replay; + + /* Reset the resource, except length. */ + res->read.va_org = iova; + res->read.va = iova; + res->read.resid = resid; + + /* Replay the RDMA read reply. */ + qp->resp.res = res; + rc = RESPST_READ_REPLY; + goto out; + } + } else { + struct resp_res *res; + + /* Find the operation in our list of responder resources. */ + res = find_resource(qp, pkt->psn); + if (res) { + struct sk_buff *skb_copy; + + skb_copy = skb_clone(res->atomic.skb, GFP_ATOMIC); + if (skb_copy) { + rxe_add_ref(qp); /* for the new SKB */ + } else { + pr_warn("Couldn't clone atomic resp\n"); + rc = RESPST_CLEANUP; + goto out; + } + bth_set_psn(SKB_TO_PKT(skb_copy), + qp->resp.psn - 1); + /* Resend the result. */ + rc = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, + pkt, skb_copy); + if (rc) { + pr_err("Failed resending result. This flow is not handled - skb ignored\n"); + kfree_skb(skb_copy); + rc = RESPST_CLEANUP; + goto out; + } + } + + /* Resource not found. Class D error. Drop the request. */ + rc = RESPST_CLEANUP; + goto out; + } +out: + return rc; +} + +/* Process a class A or C. Both are treated the same in this implementation. 
*/ +static void do_class_ac_error(struct rxe_qp *qp, u8 syndrome, + enum ib_wc_status status) +{ + qp->resp.aeth_syndrome = syndrome; + qp->resp.status = status; + + /* indicate that we should go through the ERROR state */ + qp->resp.goto_error = 1; +} + +static enum resp_states do_class_d1e_error(struct rxe_qp *qp) +{ + /* UC */ + if (qp->srq) { + /* Class E */ + qp->resp.drop_msg = 1; + if (qp->resp.wqe) { + qp->resp.status = IB_WC_REM_INV_REQ_ERR; + return RESPST_COMPLETE; + } else { + return RESPST_CLEANUP; + } + } else { + /* Class D1. This packet may be the start of a + * new message and could be valid. The previous + * message is invalid and ignored. reset the + * recv wr to its original state + */ + if (qp->resp.wqe) { + qp->resp.wqe->dma.resid = qp->resp.wqe->dma.length; + qp->resp.wqe->dma.cur_sge = 0; + qp->resp.wqe->dma.sge_offset = 0; + qp->resp.opcode = -1; + } + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + return RESPST_CLEANUP; + } +} + +int rxe_responder(void *arg) +{ + struct rxe_qp *qp = (struct rxe_qp *)arg; + enum resp_states state; + struct rxe_pkt_info *pkt = NULL; + int ret = 0; + + qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; + + if (!qp->valid) { + ret = -EINVAL; + goto done; + } + + switch (qp->resp.state) { + case QP_STATE_RESET: + state = RESPST_RESET; + break; + + default: + state = RESPST_GET_REQ; + break; + } + + while (1) { + pr_debug("state = %s\n", resp_state_name[state]); + switch (state) { + case RESPST_GET_REQ: + state = get_req(qp, &pkt); + break; + case RESPST_CHK_PSN: + state = check_psn(qp, pkt); + break; + case RESPST_CHK_OP_SEQ: + state = check_op_seq(qp, pkt); + break; + case RESPST_CHK_OP_VALID: + state = check_op_valid(qp, pkt); + break; + case RESPST_CHK_RESOURCE: + state = check_resource(qp, pkt); + break; + case RESPST_CHK_LENGTH: + state = check_length(qp, pkt); + break; + case RESPST_CHK_RKEY: + state = check_rkey(qp, pkt); + break; + case RESPST_EXECUTE: + state = execute(qp, 
pkt); + break; + case RESPST_COMPLETE: + state = do_complete(qp, pkt); + break; + case RESPST_READ_REPLY: + state = read_reply(qp, pkt); + break; + case RESPST_ACKNOWLEDGE: + state = acknowledge(qp, pkt); + break; + case RESPST_CLEANUP: + state = cleanup(qp, pkt); + break; + case RESPST_DUPLICATE_REQUEST: + state = duplicate_request(qp, pkt); + break; + case RESPST_ERR_PSN_OUT_OF_SEQ: + /* RC only - Class B. Drop packet. */ + send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); + state = RESPST_CLEANUP; + break; + + case RESPST_ERR_TOO_MANY_RDMA_ATM_REQ: + case RESPST_ERR_MISSING_OPCODE_FIRST: + case RESPST_ERR_MISSING_OPCODE_LAST_C: + case RESPST_ERR_UNSUPPORTED_OPCODE: + case RESPST_ERR_MISALIGNED_ATOMIC: + /* RC Only - Class C. */ + do_class_ac_error(qp, AETH_NAK_INVALID_REQ, + IB_WC_REM_INV_REQ_ERR); + state = RESPST_COMPLETE; + break; + + case RESPST_ERR_MISSING_OPCODE_LAST_D1E: + state = do_class_d1e_error(qp); + break; + case RESPST_ERR_RNR: + if (qp_type(qp) == IB_QPT_RC) { + /* RC - class B */ + send_ack(qp, pkt, AETH_RNR_NAK | + (~AETH_TYPE_MASK & + qp->attr.min_rnr_timer), + pkt->psn); + } else { + /* UD/UC - class D */ + qp->resp.drop_msg = 1; + } + state = RESPST_CLEANUP; + break; + + case RESPST_ERR_RKEY_VIOLATION: + if (qp_type(qp) == IB_QPT_RC) { + /* Class C */ + do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR, + IB_WC_REM_ACCESS_ERR); + state = RESPST_COMPLETE; + } else { + qp->resp.drop_msg = 1; + if (qp->srq) { + /* UC/SRQ Class D */ + qp->resp.status = IB_WC_REM_ACCESS_ERR; + state = RESPST_COMPLETE; + } else { + /* UC/non-SRQ Class E. 
*/ + state = RESPST_CLEANUP; + } + } + break; + + case RESPST_ERR_LENGTH: + if (qp_type(qp) == IB_QPT_RC) { + /* Class C */ + do_class_ac_error(qp, AETH_NAK_INVALID_REQ, + IB_WC_REM_INV_REQ_ERR); + state = RESPST_COMPLETE; + } else if (qp->srq) { + /* UC/UD - class E */ + qp->resp.status = IB_WC_REM_INV_REQ_ERR; + state = RESPST_COMPLETE; + } else { + /* UC/UD - class D */ + qp->resp.drop_msg = 1; + state = RESPST_CLEANUP; + } + break; + + case RESPST_ERR_MALFORMED_WQE: + /* All, Class A. */ + do_class_ac_error(qp, AETH_NAK_REM_OP_ERR, + IB_WC_LOC_QP_OP_ERR); + state = RESPST_COMPLETE; + break; + + case RESPST_ERR_CQ_OVERFLOW: + /* All - Class G */ + state = RESPST_ERROR; + break; + + case RESPST_DONE: + if (qp->resp.goto_error) { + state = RESPST_ERROR; + break; + } + + goto done; + + case RESPST_EXIT: + if (qp->resp.goto_error) { + state = RESPST_ERROR; + break; + } + + goto exit; + + case RESPST_RESET: { + struct sk_buff *skb; + + while ((skb = skb_dequeue(&qp->req_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + + while (!qp->srq && qp->rq.queue && + queue_head(qp->rq.queue)) + advance_consumer(qp->rq.queue); + + qp->resp.wqe = NULL; + goto exit; + } + + case RESPST_ERROR: + qp->resp.goto_error = 0; + pr_warn("qp#%d moved to error state\n", qp_num(qp)); + rxe_qp_error(qp); + goto exit; + + default: + WARN_ON(1); + } + } + +exit: + ret = -EAGAIN; +done: + return ret; +} diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c new file mode 100644 index 000000000000..2a6e3cd2d4e8 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask) +{ + if (srq && srq->error) { + pr_warn("srq in error state\n"); + goto err1; + } + + if (mask & IB_SRQ_MAX_WR) { + if (attr->max_wr > rxe->attr.max_srq_wr) { + pr_warn("max_wr(%d) > max_srq_wr(%d)\n", + attr->max_wr, rxe->attr.max_srq_wr); + goto err1; + } + + if (attr->max_wr <= 0) { + pr_warn("max_wr(%d) <= 0\n", attr->max_wr); + goto err1; + } + + if (srq && srq->limit && (attr->max_wr < srq->limit)) { + pr_warn("max_wr (%d) < srq->limit (%d)\n", + attr->max_wr, srq->limit); + goto err1; + } + + if (attr->max_wr < RXE_MIN_SRQ_WR) + attr->max_wr = RXE_MIN_SRQ_WR; + } + + if (mask & IB_SRQ_LIMIT) { + if (attr->srq_limit > rxe->attr.max_srq_wr) { + pr_warn("srq_limit(%d) > max_srq_wr(%d)\n", + attr->srq_limit, rxe->attr.max_srq_wr); + goto err1; + } + + if (srq && (attr->srq_limit > srq->rq.queue->buf->index_mask)) { + pr_warn("srq_limit (%d) > cur limit(%d)\n", + attr->srq_limit, + srq->rq.queue->buf->index_mask); + goto err1; + } + } + + if (mask == IB_SRQ_INIT_MASK) { + if (attr->max_sge > rxe->attr.max_srq_sge) { + pr_warn("max_sge(%d) > max_srq_sge(%d)\n", + attr->max_sge, rxe->attr.max_srq_sge); + goto err1; + } + + if (attr->max_sge < RXE_MIN_SRQ_SGE) + attr->max_sge = RXE_MIN_SRQ_SGE; + } + + return 0; + +err1: + return -EINVAL; +} + +int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_init_attr *init, + struct ib_ucontext *context, struct ib_udata *udata) +{ + int err; + int srq_wqe_size; + struct rxe_queue *q; + + srq->ibsrq.event_handler = init->event_handler; + srq->ibsrq.srq_context = init->srq_context; + srq->limit = init->attr.srq_limit; + srq->srq_num = srq->pelem.index; + srq->rq.max_wr = init->attr.max_wr; + srq->rq.max_sge = init->attr.max_sge; + + srq_wqe_size = rcv_wqe_size(srq->rq.max_sge); + + 
spin_lock_init(&srq->rq.producer_lock); + spin_lock_init(&srq->rq.consumer_lock); + + q = rxe_queue_init(rxe, &srq->rq.max_wr, + srq_wqe_size); + if (!q) { + pr_warn("unable to allocate queue for srq\n"); + return -ENOMEM; + } + + srq->rq.queue = q; + + err = do_mmap_info(rxe, udata, false, context, q->buf, + q->buf_size, &q->ip); + if (err) + return err; + + if (udata && udata->outlen >= sizeof(struct mminfo) + sizeof(u32)) { + if (copy_to_user(udata->outbuf + sizeof(struct mminfo), + &srq->srq_num, sizeof(u32))) + return -EFAULT; + } + return 0; +} + +int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, + struct ib_udata *udata) +{ + int err; + struct rxe_queue *q = srq->rq.queue; + struct mminfo mi = { .offset = 1, .size = 0}; + + if (mask & IB_SRQ_MAX_WR) { + /* Check that we can write the mminfo struct to user space */ + if (udata && udata->inlen >= sizeof(__u64)) { + __u64 mi_addr; + + /* Get address of user space mminfo struct */ + err = ib_copy_from_udata(&mi_addr, udata, + sizeof(mi_addr)); + if (err) + goto err1; + + udata->outbuf = (void __user *)(unsigned long)mi_addr; + udata->outlen = sizeof(mi); + + if (!access_ok(VERIFY_WRITE, + (void __user *)udata->outbuf, + udata->outlen)) { + err = -EFAULT; + goto err1; + } + } + + err = rxe_queue_resize(q, (unsigned int *)&attr->max_wr, + rcv_wqe_size(srq->rq.max_sge), + srq->rq.queue->ip ? + srq->rq.queue->ip->context : + NULL, + udata, &srq->rq.producer_lock, + &srq->rq.consumer_lock); + if (err) + goto err2; + } + + if (mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + + return 0; + +err2: + rxe_queue_cleanup(q); + srq->rq.queue = NULL; +err1: + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c new file mode 100644 index 000000000000..cf8e77800046 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. 
All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_net.h" + +/* Copy argument and remove trailing CR. Return the new length. */ +static int sanitize_arg(const char *val, char *intf, int intf_len) +{ + int len; + + if (!val) + return 0; + + /* Remove newline. 
*/ + for (len = 0; len < intf_len - 1 && val[len] && val[len] != '\n'; len++) + intf[len] = val[len]; + intf[len] = 0; + + if (len == 0 || (val[len] != 0 && val[len] != '\n')) + return 0; + + return len; +} + +static void rxe_set_port_state(struct net_device *ndev) +{ + struct rxe_dev *rxe = net_to_rxe(ndev); + bool is_up = netif_running(ndev) && netif_carrier_ok(ndev); + + if (!rxe) + goto out; + + if (is_up) + rxe_port_up(rxe); + else + rxe_port_down(rxe); /* down for unknown state */ +out: + return; +} + +static int rxe_param_set_add(const char *val, const struct kernel_param *kp) +{ + int len; + int err = 0; + char intf[32]; + struct net_device *ndev = NULL; + struct rxe_dev *rxe; + + len = sanitize_arg(val, intf, sizeof(intf)); + if (!len) { + pr_err("rxe: add: invalid interface name\n"); + err = -EINVAL; + goto err; + } + + ndev = dev_get_by_name(&init_net, intf); + if (!ndev) { + pr_err("interface %s not found\n", intf); + err = -EINVAL; + goto err; + } + + if (net_to_rxe(ndev)) { + pr_err("rxe: already configured on %s\n", intf); + err = -EINVAL; + goto err; + } + + rxe = rxe_net_add(ndev); + if (!rxe) { + pr_err("rxe: failed to add %s\n", intf); + err = -EINVAL; + goto err; + } + + rxe_set_port_state(ndev); + pr_info("rxe: added %s to %s\n", rxe->ib_dev.name, intf); +err: + if (ndev) + dev_put(ndev); + return err; +} + +static int rxe_param_set_remove(const char *val, const struct kernel_param *kp) +{ + int len; + char intf[32]; + struct rxe_dev *rxe; + + len = sanitize_arg(val, intf, sizeof(intf)); + if (!len) { + pr_err("rxe: add: invalid interface name\n"); + return -EINVAL; + } + + if (strncmp("all", intf, len) == 0) { + pr_info("rxe_sys: remove all"); + rxe_remove_all(); + return 0; + } + + rxe = get_rxe_by_name(intf); + + if (!rxe) { + pr_err("rxe: not configured on %s\n", intf); + return -EINVAL; + } + + list_del(&rxe->list); + rxe_remove(rxe); + + return 0; +} + +static const struct kernel_param_ops rxe_add_ops = { + .set = rxe_param_set_add, +}; + 
+static const struct kernel_param_ops rxe_remove_ops = { + .set = rxe_param_set_remove, +}; + +module_param_cb(add, &rxe_add_ops, NULL, 0200); +MODULE_PARM_DESC(add, "Create RXE device over network interface"); +module_param_cb(remove, &rxe_remove_ops, NULL, 0200); +MODULE_PARM_DESC(remove, "Remove RXE device over network interface"); diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c new file mode 100644 index 000000000000..1e19bf828a6e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/hardirq.h> + +#include "rxe_task.h" + +int __rxe_do_task(struct rxe_task *task) + +{ + int ret; + + while ((ret = task->func(task->arg)) == 0) + ; + + task->ret = ret; + + return ret; +} + +/* + * this locking is due to a potential race where + * a second caller finds the task already running + * but looks just after the last call to func + */ +void rxe_do_task(unsigned long data) +{ + int cont; + int ret; + unsigned long flags; + struct rxe_task *task = (struct rxe_task *)data; + + spin_lock_irqsave(&task->state_lock, flags); + switch (task->state) { + case TASK_STATE_START: + task->state = TASK_STATE_BUSY; + spin_unlock_irqrestore(&task->state_lock, flags); + break; + + case TASK_STATE_BUSY: + task->state = TASK_STATE_ARMED; + /* fall through to */ + case TASK_STATE_ARMED: + spin_unlock_irqrestore(&task->state_lock, flags); + return; + + default: + spin_unlock_irqrestore(&task->state_lock, flags); + pr_warn("bad state = %d in rxe_do_task\n", task->state); + return; + } + + do { + cont = 0; + ret = task->func(task->arg); + + spin_lock_irqsave(&task->state_lock, flags); + switch (task->state) { + case TASK_STATE_BUSY: + if (ret) + task->state = TASK_STATE_START; + else + cont = 1; + break; + + /* soneone tried to run the task since the last time we called + * func, so we will call one more time regardless of the + * return value + */ + case TASK_STATE_ARMED: + task->state = TASK_STATE_BUSY; + cont = 1; + break; + + default: + pr_warn("bad state = %d in rxe_do_task\n", + task->state); + } + spin_unlock_irqrestore(&task->state_lock, flags); + } while (cont); + + task->ret = ret; +} + +int rxe_init_task(void *obj, 
struct rxe_task *task, + void *arg, int (*func)(void *), char *name) +{ + task->obj = obj; + task->arg = arg; + task->func = func; + snprintf(task->name, sizeof(task->name), "%s", name); + + tasklet_init(&task->tasklet, rxe_do_task, (unsigned long)task); + + task->state = TASK_STATE_START; + spin_lock_init(&task->state_lock); + + return 0; +} + +void rxe_cleanup_task(struct rxe_task *task) +{ + tasklet_kill(&task->tasklet); +} + +void rxe_run_task(struct rxe_task *task, int sched) +{ + if (sched) + tasklet_schedule(&task->tasklet); + else + rxe_do_task((unsigned long)task); +} + +void rxe_disable_task(struct rxe_task *task) +{ + tasklet_disable(&task->tasklet); +} + +void rxe_enable_task(struct rxe_task *task) +{ + tasklet_enable(&task->tasklet); +} diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h new file mode 100644 index 000000000000..d14aa6daed05 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_TASK_H +#define RXE_TASK_H + +enum { + TASK_STATE_START = 0, + TASK_STATE_BUSY = 1, + TASK_STATE_ARMED = 2, +}; + +/* + * data structure to describe a 'task' which is a short + * function that returns 0 as long as it needs to be + * called again. + */ +struct rxe_task { + void *obj; + struct tasklet_struct tasklet; + int state; + spinlock_t state_lock; /* spinlock for task state */ + void *arg; + int (*func)(void *arg); + int ret; + char name[16]; +}; + +/* + * init rxe_task structure + * arg => parameter to pass to fcn + * fcn => function to call until it returns != 0 + */ +int rxe_init_task(void *obj, struct rxe_task *task, + void *arg, int (*func)(void *), char *name); + +/* cleanup task */ +void rxe_cleanup_task(struct rxe_task *task); + +/* + * raw call to func in loop without any checking + * can call when tasklets are disabled + */ +int __rxe_do_task(struct rxe_task *task); + +/* + * common function called by any of the main tasklets + * If there is any chance that there is additional + * work to do someone must reschedule the task before + * leaving + */ +void rxe_do_task(unsigned long data); + +/* run a task, else schedule it to run as a tasklet, The decision + * to run or schedule tasklet is based on the parameter sched. 
+ */ +void rxe_run_task(struct rxe_task *task, int sched); + +/* keep a task from scheduling */ +void rxe_disable_task(struct rxe_task *task); + +/* allow task to run */ +void rxe_enable_task(struct rxe_task *task); + +#endif /* RXE_TASK_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c new file mode 100644 index 000000000000..4552be960c6a --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -0,0 +1,1330 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +static int rxe_query_device(struct ib_device *dev, + struct ib_device_attr *attr, + struct ib_udata *uhw) +{ + struct rxe_dev *rxe = to_rdev(dev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + *attr = rxe->attr; + return 0; +} + +static void rxe_eth_speed_to_ib_speed(int speed, u8 *active_speed, + u8 *active_width) +{ + if (speed <= 1000) { + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_SDR; + } else if (speed <= 10000) { + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_FDR10; + } else if (speed <= 20000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_DDR; + } else if (speed <= 30000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + } else if (speed <= 40000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_FDR10; + } else { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + } +} + +static int rxe_query_port(struct ib_device *dev, + u8 port_num, struct ib_port_attr *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_port *port; + u32 speed; + + if (unlikely(port_num != 1)) { + pr_warn("invalid port_number %d\n", port_num); + goto err1; + } + + port = &rxe->port; + + *attr = port->attr; + + mutex_lock(&rxe->usdev_lock); + if (rxe->ndev->ethtool_ops->get_link_ksettings) { + struct ethtool_link_ksettings ks; + + rxe->ndev->ethtool_ops->get_link_ksettings(rxe->ndev, &ks); + speed = ks.base.speed; + } else if (rxe->ndev->ethtool_ops->get_settings) { + struct ethtool_cmd cmd; + + rxe->ndev->ethtool_ops->get_settings(rxe->ndev, &cmd); + speed = cmd.speed; + } else { + pr_warn("%s speed is unknown, defaulting to 1000\n", rxe->ndev->name); + speed = 1000; + } + rxe_eth_speed_to_ib_speed(speed, &attr->active_speed, &attr->active_width); + mutex_unlock(&rxe->usdev_lock); + + return 0; + +err1: + return -EINVAL; +} + +static int rxe_query_gid(struct ib_device *device, + u8 port_num, int index, union ib_gid 
*gid) +{ + int ret; + + if (index > RXE_PORT_GID_TBL_LEN) + return -EINVAL; + + ret = ib_get_cached_gid(device, port_num, index, gid, NULL); + if (ret == -EAGAIN) { + memcpy(gid, &zgid, sizeof(*gid)); + return 0; + } + + return ret; +} + +static int rxe_add_gid(struct ib_device *device, u8 port_num, unsigned int + index, const union ib_gid *gid, + const struct ib_gid_attr *attr, void **context) +{ + if (index >= RXE_PORT_GID_TBL_LEN) + return -EINVAL; + return 0; +} + +static int rxe_del_gid(struct ib_device *device, u8 port_num, unsigned int + index, void **context) +{ + if (index >= RXE_PORT_GID_TBL_LEN) + return -EINVAL; + return 0; +} + +static struct net_device *rxe_get_netdev(struct ib_device *device, + u8 port_num) +{ + struct rxe_dev *rxe = to_rdev(device); + + if (rxe->ndev) { + dev_hold(rxe->ndev); + return rxe->ndev; + } + + return NULL; +} + +static int rxe_query_pkey(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey) +{ + struct rxe_dev *rxe = to_rdev(device); + struct rxe_port *port; + + if (unlikely(port_num != 1)) { + dev_warn(device->dma_device, "invalid port_num = %d\n", + port_num); + goto err1; + } + + port = &rxe->port; + + if (unlikely(index >= port->attr.pkey_tbl_len)) { + dev_warn(device->dma_device, "invalid index = %d\n", + index); + goto err1; + } + + *pkey = port->pkey_tbl[index]; + return 0; + +err1: + return -EINVAL; +} + +static int rxe_modify_device(struct ib_device *dev, + int mask, struct ib_device_modify *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + + if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) + rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid); + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(rxe->ib_dev.node_desc, + attr->node_desc, sizeof(rxe->ib_dev.node_desc)); + } + + return 0; +} + +static int rxe_modify_port(struct ib_device *dev, + u8 port_num, int mask, struct ib_port_modify *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_port *port; + + if (unlikely(port_num != 1)) { + 
pr_warn("invalid port_num = %d\n", port_num); + goto err1; + } + + port = &rxe->port; + + port->attr.port_cap_flags |= attr->set_port_cap_mask; + port->attr.port_cap_flags &= ~attr->clr_port_cap_mask; + + if (mask & IB_PORT_RESET_QKEY_CNTR) + port->attr.qkey_viol_cntr = 0; + + return 0; + +err1: + return -EINVAL; +} + +static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev, + u8 port_num) +{ + struct rxe_dev *rxe = to_rdev(dev); + + return rxe->ifc_ops->link_layer(rxe, port_num); +} + +static struct ib_ucontext *rxe_alloc_ucontext(struct ib_device *dev, + struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_ucontext *uc; + + uc = rxe_alloc(&rxe->uc_pool); + return uc ? &uc->ibuc : ERR_PTR(-ENOMEM); +} + +static int rxe_dealloc_ucontext(struct ib_ucontext *ibuc) +{ + struct rxe_ucontext *uc = to_ruc(ibuc); + + rxe_drop_ref(uc); + return 0; +} + +static int rxe_port_immutable(struct ib_device *dev, u8 port_num, + struct ib_port_immutable *immutable) +{ + int err; + struct ib_port_attr attr; + + err = rxe_query_port(dev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static struct ib_pd *rxe_alloc_pd(struct ib_device *dev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_pd *pd; + + pd = rxe_alloc(&rxe->pd_pool); + return pd ? 
&pd->ibpd : ERR_PTR(-ENOMEM); +} + +static int rxe_dealloc_pd(struct ib_pd *ibpd) +{ + struct rxe_pd *pd = to_rpd(ibpd); + + rxe_drop_ref(pd); + return 0; +} + +static int rxe_init_av(struct rxe_dev *rxe, struct ib_ah_attr *attr, + struct rxe_av *av) +{ + int err; + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + + err = ib_get_cached_gid(&rxe->ib_dev, attr->port_num, + attr->grh.sgid_index, &sgid, + &sgid_attr); + if (err) { + pr_err("Failed to query sgid. err = %d\n", err); + return err; + } + + err = rxe_av_from_attr(rxe, attr->port_num, av, attr); + if (!err) + err = rxe_av_fill_ip_info(rxe, av, attr, &sgid_attr, &sgid); + + if (sgid_attr.ndev) + dev_put(sgid_attr.ndev); + return err; +} + +static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_ah *ah; + + err = rxe_av_chk_attr(rxe, attr); + if (err) + goto err1; + + ah = rxe_alloc(&rxe->ah_pool); + if (!ah) { + err = -ENOMEM; + goto err1; + } + + rxe_add_ref(pd); + ah->pd = pd; + + err = rxe_init_av(rxe, attr, &ah->av); + if (err) + goto err2; + + return &ah->ibah; + +err2: + rxe_drop_ref(pd); + rxe_drop_ref(ah); +err1: + return ERR_PTR(err); +} + +static int rxe_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibah->device); + struct rxe_ah *ah = to_rah(ibah); + + err = rxe_av_chk_attr(rxe, attr); + if (err) + return err; + + err = rxe_init_av(rxe, attr, &ah->av); + if (err) + return err; + + return 0; +} + +static int rxe_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr) +{ + struct rxe_dev *rxe = to_rdev(ibah->device); + struct rxe_ah *ah = to_rah(ibah); + + rxe_av_to_attr(rxe, &ah->av, attr); + return 0; +} + +static int rxe_destroy_ah(struct ib_ah *ibah) +{ + struct rxe_ah *ah = to_rah(ibah); + + rxe_drop_ref(ah->pd); + rxe_drop_ref(ah); + return 0; +} + +static int post_one_recv(struct rxe_rq *rq, struct 
ib_recv_wr *ibwr) +{ + int err; + int i; + u32 length; + struct rxe_recv_wqe *recv_wqe; + int num_sge = ibwr->num_sge; + + if (unlikely(queue_full(rq->queue))) { + err = -ENOMEM; + goto err1; + } + + if (unlikely(num_sge > rq->max_sge)) { + err = -EINVAL; + goto err1; + } + + length = 0; + for (i = 0; i < num_sge; i++) + length += ibwr->sg_list[i].length; + + recv_wqe = producer_addr(rq->queue); + recv_wqe->wr_id = ibwr->wr_id; + recv_wqe->num_sge = num_sge; + + memcpy(recv_wqe->dma.sge, ibwr->sg_list, + num_sge * sizeof(struct ib_sge)); + + recv_wqe->dma.length = length; + recv_wqe->dma.resid = length; + recv_wqe->dma.num_sge = num_sge; + recv_wqe->dma.cur_sge = 0; + recv_wqe->dma.sge_offset = 0; + + /* make sure all changes to the work queue are written before we + * update the producer pointer + */ + smp_wmb(); + + advance_producer(rq->queue); + return 0; + +err1: + return err; +} + +static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *init, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_srq *srq; + struct ib_ucontext *context = udata ? 
ibpd->uobject->context : NULL; + + err = rxe_srq_chk_attr(rxe, NULL, &init->attr, IB_SRQ_INIT_MASK); + if (err) + goto err1; + + srq = rxe_alloc(&rxe->srq_pool); + if (!srq) { + err = -ENOMEM; + goto err1; + } + + rxe_add_index(srq); + rxe_add_ref(pd); + srq->pd = pd; + + err = rxe_srq_from_init(rxe, srq, init, context, udata); + if (err) + goto err2; + + return &srq->ibsrq; + +err2: + rxe_drop_ref(pd); + rxe_drop_index(srq); + rxe_drop_ref(srq); +err1: + return ERR_PTR(err); +} + +static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask mask, + struct ib_udata *udata) +{ + int err; + struct rxe_srq *srq = to_rsrq(ibsrq); + struct rxe_dev *rxe = to_rdev(ibsrq->device); + + err = rxe_srq_chk_attr(rxe, srq, attr, mask); + if (err) + goto err1; + + err = rxe_srq_from_attr(rxe, srq, attr, mask, udata); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + + if (srq->error) + return -EINVAL; + + attr->max_wr = srq->rq.queue->buf->index_mask; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +static int rxe_destroy_srq(struct ib_srq *ibsrq) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + + if (srq->rq.queue) + rxe_queue_cleanup(srq->rq.queue); + + rxe_drop_ref(srq->pd); + rxe_drop_index(srq); + rxe_drop_ref(srq); + + return 0; +} + +static int rxe_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + unsigned long flags; + struct rxe_srq *srq = to_rsrq(ibsrq); + + spin_lock_irqsave(&srq->rq.producer_lock, flags); + + while (wr) { + err = post_one_recv(&srq->rq, wr); + if (unlikely(err)) + break; + wr = wr->next; + } + + spin_unlock_irqrestore(&srq->rq.producer_lock, flags); + + if (err) + *bad_wr = wr; + + return err; +} + +static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init, + struct 
ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_qp *qp; + + err = rxe_qp_chk_init(rxe, init); + if (err) + goto err1; + + qp = rxe_alloc(&rxe->qp_pool); + if (!qp) { + err = -ENOMEM; + goto err1; + } + + if (udata) { + if (udata->inlen) { + err = -EINVAL; + goto err1; + } + qp->is_user = 1; + } + + rxe_add_index(qp); + + err = rxe_qp_from_init(rxe, qp, pd, init, udata, ibpd); + if (err) + goto err2; + + return &qp->ibqp; + +err2: + rxe_drop_index(qp); + rxe_drop_ref(qp); +err1: + return ERR_PTR(err); +} + +static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + + err = rxe_qp_chk_attr(rxe, qp, attr, mask); + if (err) + goto err1; + + err = rxe_qp_from_attr(qp, attr, mask, udata); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_qp_init_attr *init) +{ + struct rxe_qp *qp = to_rqp(ibqp); + + rxe_qp_to_init(qp, init); + rxe_qp_to_attr(qp, attr, mask); + + return 0; +} + +static int rxe_destroy_qp(struct ib_qp *ibqp) +{ + struct rxe_qp *qp = to_rqp(ibqp); + + rxe_qp_destroy(qp); + rxe_drop_index(qp); + rxe_drop_ref(qp); + return 0; +} + +static int validate_send_wr(struct rxe_qp *qp, struct ib_send_wr *ibwr, + unsigned int mask, unsigned int length) +{ + int num_sge = ibwr->num_sge; + struct rxe_sq *sq = &qp->sq; + + if (unlikely(num_sge > sq->max_sge)) + goto err1; + + if (unlikely(mask & WR_ATOMIC_MASK)) { + if (length < 8) + goto err1; + + if (atomic_wr(ibwr)->remote_addr & 0x7) + goto err1; + } + + if (unlikely((ibwr->send_flags & IB_SEND_INLINE) && + (length > sq->max_inline))) + goto err1; + + return 0; + +err1: + return -EINVAL; +} + +static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, + struct ib_send_wr *ibwr) +{ 
+ wr->wr_id = ibwr->wr_id; + wr->num_sge = ibwr->num_sge; + wr->opcode = ibwr->opcode; + wr->send_flags = ibwr->send_flags; + + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) { + wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn; + wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey; + if (qp_type(qp) == IB_QPT_GSI) + wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index; + if (wr->opcode == IB_WR_SEND_WITH_IMM) + wr->ex.imm_data = ibwr->ex.imm_data; + } else { + switch (wr->opcode) { + case IB_WR_RDMA_WRITE_WITH_IMM: + wr->ex.imm_data = ibwr->ex.imm_data; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; + wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; + break; + case IB_WR_SEND_WITH_IMM: + wr->ex.imm_data = ibwr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + wr->wr.atomic.remote_addr = + atomic_wr(ibwr)->remote_addr; + wr->wr.atomic.compare_add = + atomic_wr(ibwr)->compare_add; + wr->wr.atomic.swap = atomic_wr(ibwr)->swap; + wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey; + break; + case IB_WR_LOCAL_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + break; + case IB_WR_REG_MR: + wr->wr.reg.mr = reg_wr(ibwr)->mr; + wr->wr.reg.key = reg_wr(ibwr)->key; + wr->wr.reg.access = reg_wr(ibwr)->access; + break; + default: + break; + } + } +} + +static int init_send_wqe(struct rxe_qp *qp, struct ib_send_wr *ibwr, + unsigned int mask, unsigned int length, + struct rxe_send_wqe *wqe) +{ + int num_sge = ibwr->num_sge; + struct ib_sge *sge; + int i; + u8 *p; + + init_send_wr(qp, &wqe->wr, ibwr); + + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) + memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av)); + + if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) { + p = wqe->dma.inline_data; + + sge = ibwr->sg_list; 
+ for (i = 0; i < num_sge; i++, sge++) { + if (qp->is_user && copy_from_user(p, (__user void *) + (uintptr_t)sge->addr, sge->length)) + return -EFAULT; + + else if (!qp->is_user) + memcpy(p, (void *)(uintptr_t)sge->addr, + sge->length); + + p += sge->length; + } + } else if (mask & WR_REG_MASK) { + wqe->mask = mask; + wqe->state = wqe_state_posted; + return 0; + } else + memcpy(wqe->dma.sge, ibwr->sg_list, + num_sge * sizeof(struct ib_sge)); + + wqe->iova = (mask & WR_ATOMIC_MASK) ? + atomic_wr(ibwr)->remote_addr : + rdma_wr(ibwr)->remote_addr; + wqe->mask = mask; + wqe->dma.length = length; + wqe->dma.resid = length; + wqe->dma.num_sge = num_sge; + wqe->dma.cur_sge = 0; + wqe->dma.sge_offset = 0; + wqe->state = wqe_state_posted; + wqe->ssn = atomic_add_return(1, &qp->ssn); + + return 0; +} + +static int post_one_send(struct rxe_qp *qp, struct ib_send_wr *ibwr, + unsigned mask, u32 length) +{ + int err; + struct rxe_sq *sq = &qp->sq; + struct rxe_send_wqe *send_wqe; + unsigned long flags; + + err = validate_send_wr(qp, ibwr, mask, length); + if (err) + return err; + + spin_lock_irqsave(&qp->sq.sq_lock, flags); + + if (unlikely(queue_full(sq->queue))) { + err = -ENOMEM; + goto err1; + } + + send_wqe = producer_addr(sq->queue); + + err = init_send_wqe(qp, ibwr, mask, length, send_wqe); + if (unlikely(err)) + goto err1; + + /* + * make sure all changes to the work queue are + * written before we update the producer pointer + */ + smp_wmb(); + + advance_producer(sq->queue); + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); + + return 0; + +err1: + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); + return err; +} + +static int rxe_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + struct rxe_qp *qp = to_rqp(ibqp); + unsigned int mask; + unsigned int length = 0; + int i; + int must_sched; + + if (unlikely(!qp->valid)) { + *bad_wr = wr; + return -EINVAL; + } + + if (unlikely(qp->req.state < QP_STATE_READY)) { + *bad_wr = 
wr; + return -EINVAL; + } + + while (wr) { + mask = wr_opcode_mask(wr->opcode, qp); + if (unlikely(!mask)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely((wr->send_flags & IB_SEND_INLINE) && + !(mask & WR_INLINE_MASK))) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + length = 0; + for (i = 0; i < wr->num_sge; i++) + length += wr->sg_list[i].length; + + err = post_one_send(qp, wr, mask, length); + + if (err) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + + /* + * Must sched in case of GSI QP because ib_send_mad() hold irq lock, + * and the requester call ip_local_out_sk() that takes spin_lock_bh. + */ + must_sched = (qp_type(qp) == IB_QPT_GSI) || + (queue_count(qp->sq.queue) > 1); + + rxe_run_task(&qp->req.task, must_sched); + + return err; +} + +static int rxe_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_rq *rq = &qp->rq; + unsigned long flags; + + if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid)) { + *bad_wr = wr; + err = -EINVAL; + goto err1; + } + + if (unlikely(qp->srq)) { + *bad_wr = wr; + err = -EINVAL; + goto err1; + } + + spin_lock_irqsave(&rq->producer_lock, flags); + + while (wr) { + err = post_one_recv(rq, wr); + if (unlikely(err)) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + + spin_unlock_irqrestore(&rq->producer_lock, flags); + +err1: + return err; +} + +static struct ib_cq *rxe_create_cq(struct ib_device *dev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_cq *cq; + + if (attr->flags) + return ERR_PTR(-EINVAL); + + err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector, udata); + if (err) + goto err1; + + cq = rxe_alloc(&rxe->cq_pool); + if (!cq) { + err = -ENOMEM; + goto err1; + } + + err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, + context, udata); + if (err) + goto 
err2; + + return &cq->ibcq; + +err2: + rxe_drop_ref(cq); +err1: + return ERR_PTR(err); +} + +static int rxe_destroy_cq(struct ib_cq *ibcq) +{ + struct rxe_cq *cq = to_rcq(ibcq); + + rxe_drop_ref(cq); + return 0; +} + +static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + int err; + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_dev *rxe = to_rdev(ibcq->device); + + err = rxe_cq_chk_attr(rxe, cq, cqe, 0, udata); + if (err) + goto err1; + + err = rxe_cq_resize_queue(cq, cqe, udata); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + int i; + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_cqe *cqe; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + for (i = 0; i < num_entries; i++) { + cqe = queue_head(cq->queue); + if (!cqe) + break; + + memcpy(wc++, &cqe->ibwc, sizeof(*wc)); + advance_consumer(cq->queue); + } + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return i; +} + +static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt) +{ + struct rxe_cq *cq = to_rcq(ibcq); + int count = queue_count(cq->queue); + + return (count > wc_cnt) ? 
wc_cnt : count; +} + +static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct rxe_cq *cq = to_rcq(ibcq); + + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = flags & IB_CQ_SOLICITED_MASK; + + return 0; +} + +static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + int err; + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) { + err = -ENOMEM; + goto err1; + } + + rxe_add_index(mr); + + rxe_add_ref(pd); + + err = rxe_mem_init_dma(rxe, pd, access, mr); + if (err) + goto err2; + + return &mr->ibmr; + +err2: + rxe_drop_ref(pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); +err1: + return ERR_PTR(err); +} + +static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, + u64 start, + u64 length, + u64 iova, + int access, struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) { + err = -ENOMEM; + goto err2; + } + + rxe_add_index(mr); + + rxe_add_ref(pd); + + err = rxe_mem_init_user(rxe, pd, start, length, iova, + access, udata, mr); + if (err) + goto err3; + + return &mr->ibmr; + +err3: + rxe_drop_ref(pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); +err2: + return ERR_PTR(err); +} + +static int rxe_dereg_mr(struct ib_mr *ibmr) +{ + struct rxe_mem *mr = to_rmr(ibmr); + + mr->state = RXE_MEM_STATE_ZOMBIE; + rxe_drop_ref(mr->pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); + return 0; +} + +static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + int err; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) { + err = -ENOMEM; + goto err1; + } + + rxe_add_index(mr); + + rxe_add_ref(pd); + + err = 
rxe_mem_init_fast(rxe, pd, max_num_sg, mr); + if (err) + goto err2; + + return &mr->ibmr; + +err2: + rxe_drop_ref(pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); +err1: + return ERR_PTR(err); +} + +static int rxe_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct rxe_mem *mr = to_rmr(ibmr); + struct rxe_map *map; + struct rxe_phys_buf *buf; + + if (unlikely(mr->nbuf == mr->num_buf)) + return -ENOMEM; + + map = mr->map[mr->nbuf / RXE_BUF_PER_MAP]; + buf = &map->buf[mr->nbuf % RXE_BUF_PER_MAP]; + + buf->addr = addr; + buf->size = ibmr->page_size; + mr->nbuf++; + + return 0; +} + +static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct rxe_mem *mr = to_rmr(ibmr); + int n; + + mr->nbuf = 0; + + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page); + + mr->va = ibmr->iova; + mr->iova = ibmr->iova; + mr->length = ibmr->length; + mr->page_shift = ilog2(ibmr->page_size); + mr->page_mask = ibmr->page_size - 1; + mr->offset = mr->iova & mr->page_mask; + + return n; +} + +static int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_mc_grp *grp; + + /* takes a ref on grp if successful */ + err = rxe_mcast_get_grp(rxe, mgid, &grp); + if (err) + return err; + + err = rxe_mcast_add_grp_elem(rxe, qp, grp); + + rxe_drop_ref(grp); + return err; +} + +static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + + return rxe_mcast_drop_grp_elem(rxe, qp, mgid); +} + +static ssize_t rxe_show_parent(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct rxe_dev *rxe = container_of(device, struct rxe_dev, + ib_dev.dev); + char *name; + + name = rxe->ifc_ops->parent_name(rxe, 1); + return snprintf(buf, 16, "%s\n", name); +} + +static DEVICE_ATTR(parent, S_IRUGO, 
rxe_show_parent, NULL); + +static struct device_attribute *rxe_dev_attributes[] = { + &dev_attr_parent, +}; + +int rxe_register_device(struct rxe_dev *rxe) +{ + int err; + int i; + struct ib_device *dev = &rxe->ib_dev; + + strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX); + strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); + + dev->owner = THIS_MODULE; + dev->node_type = RDMA_NODE_IB_CA; + dev->phys_port_cnt = 1; + dev->num_comp_vectors = RXE_NUM_COMP_VECTORS; + dev->dma_device = rxe->ifc_ops->dma_device(rxe); + dev->local_dma_lkey = 0; + dev->node_guid = rxe->ifc_ops->node_guid(rxe); + dev->dma_ops = &rxe_dma_mapping_ops; + + dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION; + dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) + | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) + | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_POST_SRQ_RECV) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) + | BIT_ULL(IB_USER_VERBS_CMD_POST_RECV) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_PEEK_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) + | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) + | 
BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) + ; + + dev->query_device = rxe_query_device; + dev->modify_device = rxe_modify_device; + dev->query_port = rxe_query_port; + dev->modify_port = rxe_modify_port; + dev->get_link_layer = rxe_get_link_layer; + dev->query_gid = rxe_query_gid; + dev->get_netdev = rxe_get_netdev; + dev->add_gid = rxe_add_gid; + dev->del_gid = rxe_del_gid; + dev->query_pkey = rxe_query_pkey; + dev->alloc_ucontext = rxe_alloc_ucontext; + dev->dealloc_ucontext = rxe_dealloc_ucontext; + dev->mmap = rxe_mmap; + dev->get_port_immutable = rxe_port_immutable; + dev->alloc_pd = rxe_alloc_pd; + dev->dealloc_pd = rxe_dealloc_pd; + dev->create_ah = rxe_create_ah; + dev->modify_ah = rxe_modify_ah; + dev->query_ah = rxe_query_ah; + dev->destroy_ah = rxe_destroy_ah; + dev->create_srq = rxe_create_srq; + dev->modify_srq = rxe_modify_srq; + dev->query_srq = rxe_query_srq; + dev->destroy_srq = rxe_destroy_srq; + dev->post_srq_recv = rxe_post_srq_recv; + dev->create_qp = rxe_create_qp; + dev->modify_qp = rxe_modify_qp; + dev->query_qp = rxe_query_qp; + dev->destroy_qp = rxe_destroy_qp; + dev->post_send = rxe_post_send; + dev->post_recv = rxe_post_recv; + dev->create_cq = rxe_create_cq; + dev->destroy_cq = rxe_destroy_cq; + dev->resize_cq = rxe_resize_cq; + dev->poll_cq = rxe_poll_cq; + dev->peek_cq = rxe_peek_cq; + dev->req_notify_cq = rxe_req_notify_cq; + dev->get_dma_mr = rxe_get_dma_mr; + dev->reg_user_mr = rxe_reg_user_mr; + dev->dereg_mr = rxe_dereg_mr; + dev->alloc_mr = rxe_alloc_mr; + dev->map_mr_sg = rxe_map_mr_sg; + dev->attach_mcast = rxe_attach_mcast; + dev->detach_mcast = rxe_detach_mcast; + + err = ib_register_device(dev, NULL); + if (err) { + pr_warn("rxe_register_device failed, err = %d\n", err); + goto err1; + } + + for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) { + err = device_create_file(&dev->dev, rxe_dev_attributes[i]); + if (err) { + pr_warn("device_create_file failed, i = %d, err = %d\n", + i, err); + goto err2; + } + } + + return 0; + 
+err2: + ib_unregister_device(dev); +err1: + return err; +} + +int rxe_unregister_device(struct rxe_dev *rxe) +{ + int i; + struct ib_device *dev = &rxe->ib_dev; + + for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) + device_remove_file(&dev->dev, rxe_dev_attributes[i]); + + ib_unregister_device(dev); + + return 0; +} diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h new file mode 100644 index 000000000000..cac1d52a08f0 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_VERBS_H +#define RXE_VERBS_H + +#include <linux/interrupt.h> +#include <rdma/rdma_user_rxe.h> +#include "rxe_pool.h" +#include "rxe_task.h" + +static inline int pkey_match(u16 key1, u16 key2) +{ + return (((key1 & 0x7fff) != 0) && + ((key1 & 0x7fff) == (key2 & 0x7fff)) && + ((key1 & 0x8000) || (key2 & 0x8000))) ? 1 : 0; +} + +/* Return >0 if psn_a > psn_b + * 0 if psn_a == psn_b + * <0 if psn_a < psn_b + */ +static inline int psn_compare(u32 psn_a, u32 psn_b) +{ + s32 diff; + + diff = (psn_a - psn_b) << 8; + return diff; +} + +struct rxe_ucontext { + struct rxe_pool_entry pelem; + struct ib_ucontext ibuc; +}; + +struct rxe_pd { + struct rxe_pool_entry pelem; + struct ib_pd ibpd; +}; + +struct rxe_ah { + struct rxe_pool_entry pelem; + struct ib_ah ibah; + struct rxe_pd *pd; + struct rxe_av av; +}; + +struct rxe_cqe { + union { + struct ib_wc ibwc; + struct ib_uverbs_wc uibwc; + }; +}; + +struct rxe_cq { + struct rxe_pool_entry pelem; + struct ib_cq ibcq; + struct rxe_queue *queue; + spinlock_t cq_lock; + u8 notify; + int is_user; + struct tasklet_struct comp_task; +}; + +enum wqe_state { + wqe_state_posted, + wqe_state_processing, + wqe_state_pending, + wqe_state_done, + wqe_state_error, +}; + +struct rxe_sq { + int max_wr; + int max_sge; + int max_inline; + spinlock_t sq_lock; /* guard queue */ + struct rxe_queue *queue; +}; + +struct rxe_rq { + int max_wr; + int max_sge; + spinlock_t producer_lock; /* guard queue producer */ + spinlock_t consumer_lock; /* guard queue consumer */ + struct rxe_queue *queue; +}; + +struct rxe_srq { + struct rxe_pool_entry pelem; + struct ib_srq ibsrq; + struct rxe_pd *pd; + struct rxe_rq rq; + u32 srq_num; + + int limit; + int error; +}; + 
+enum rxe_qp_state { + QP_STATE_RESET, + QP_STATE_INIT, + QP_STATE_READY, + QP_STATE_DRAIN, /* req only */ + QP_STATE_DRAINED, /* req only */ + QP_STATE_ERROR +}; + +extern char *rxe_qp_state_name[]; + +struct rxe_req_info { + enum rxe_qp_state state; + int wqe_index; + u32 psn; + int opcode; + atomic_t rd_atomic; + int wait_fence; + int need_rd_atomic; + int wait_psn; + int need_retry; + int noack_pkts; + struct rxe_task task; +}; + +struct rxe_comp_info { + u32 psn; + int opcode; + int timeout; + int timeout_retry; + u32 retry_cnt; + u32 rnr_retry; + struct rxe_task task; +}; + +enum rdatm_res_state { + rdatm_res_state_next, + rdatm_res_state_new, + rdatm_res_state_replay, +}; + +struct resp_res { + int type; + u32 first_psn; + u32 last_psn; + u32 cur_psn; + enum rdatm_res_state state; + + union { + struct { + struct sk_buff *skb; + } atomic; + struct { + struct rxe_mem *mr; + u64 va_org; + u32 rkey; + u32 length; + u64 va; + u32 resid; + } read; + }; +}; + +struct rxe_resp_info { + enum rxe_qp_state state; + u32 msn; + u32 psn; + int opcode; + int drop_msg; + int goto_error; + int sent_psn_nak; + enum ib_wc_status status; + u8 aeth_syndrome; + + /* Receive only */ + struct rxe_recv_wqe *wqe; + + /* RDMA read / atomic only */ + u64 va; + struct rxe_mem *mr; + u32 resid; + u32 rkey; + u64 atomic_orig; + + /* SRQ only */ + struct { + struct rxe_recv_wqe wqe; + struct ib_sge sge[RXE_MAX_SGE]; + } srq_wqe; + + /* Responder resources. It's a circular list where the oldest + * resource is dropped first. 
+ */ + struct resp_res *resources; + unsigned int res_head; + unsigned int res_tail; + struct resp_res *res; + struct rxe_task task; +}; + +struct rxe_qp { + struct rxe_pool_entry pelem; + struct ib_qp ibqp; + struct ib_qp_attr attr; + unsigned int valid; + unsigned int mtu; + int is_user; + + struct rxe_pd *pd; + struct rxe_srq *srq; + struct rxe_cq *scq; + struct rxe_cq *rcq; + + enum ib_sig_type sq_sig_type; + + struct rxe_sq sq; + struct rxe_rq rq; + + struct socket *sk; + + struct rxe_av pri_av; + struct rxe_av alt_av; + + /* list of mcast groups qp has joined (for cleanup) */ + struct list_head grp_list; + spinlock_t grp_lock; /* guard grp_list */ + + struct sk_buff_head req_pkts; + struct sk_buff_head resp_pkts; + struct sk_buff_head send_pkts; + + struct rxe_req_info req; + struct rxe_comp_info comp; + struct rxe_resp_info resp; + + atomic_t ssn; + atomic_t skb_out; + int need_req_skb; + + /* Timer for retranmitting packet when ACKs have been lost. RC + * only. The requester sets it when it is not already + * started. The responder resets it whenever an ack is + * received. + */ + struct timer_list retrans_timer; + u64 qp_timeout_jiffies; + + /* Timer for handling RNR NAKS. 
*/ + struct timer_list rnr_nak_timer; + + spinlock_t state_lock; /* guard requester and completer */ +}; + +enum rxe_mem_state { + RXE_MEM_STATE_ZOMBIE, + RXE_MEM_STATE_INVALID, + RXE_MEM_STATE_FREE, + RXE_MEM_STATE_VALID, +}; + +enum rxe_mem_type { + RXE_MEM_TYPE_NONE, + RXE_MEM_TYPE_DMA, + RXE_MEM_TYPE_MR, + RXE_MEM_TYPE_FMR, + RXE_MEM_TYPE_MW, +}; + +#define RXE_BUF_PER_MAP (PAGE_SIZE / sizeof(struct rxe_phys_buf)) + +struct rxe_phys_buf { + u64 addr; + u64 size; +}; + +struct rxe_map { + struct rxe_phys_buf buf[RXE_BUF_PER_MAP]; +}; + +struct rxe_mem { + struct rxe_pool_entry pelem; + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + }; + + struct rxe_pd *pd; + struct ib_umem *umem; + + u32 lkey; + u32 rkey; + + enum rxe_mem_state state; + enum rxe_mem_type type; + u64 va; + u64 iova; + size_t length; + u32 offset; + int access; + + int page_shift; + int page_mask; + int map_shift; + int map_mask; + + u32 num_buf; + u32 nbuf; + + u32 max_buf; + u32 num_map; + + struct rxe_map **map; +}; + +struct rxe_mc_grp { + struct rxe_pool_entry pelem; + spinlock_t mcg_lock; /* guard group */ + struct rxe_dev *rxe; + struct list_head qp_list; + union ib_gid mgid; + int num_qp; + u32 qkey; + u16 pkey; +}; + +struct rxe_mc_elem { + struct rxe_pool_entry pelem; + struct list_head qp_list; + struct list_head grp_list; + struct rxe_qp *qp; + struct rxe_mc_grp *grp; +}; + +struct rxe_port { + struct ib_port_attr attr; + u16 *pkey_tbl; + __be64 port_guid; + __be64 subnet_prefix; + spinlock_t port_lock; /* guard port */ + unsigned int mtu_cap; + /* special QPs */ + u32 qp_smi_index; + u32 qp_gsi_index; +}; + +/* callbacks from rdma_rxe to network interface layer */ +struct rxe_ifc_ops { + void (*release)(struct rxe_dev *rxe); + __be64 (*node_guid)(struct rxe_dev *rxe); + __be64 (*port_guid)(struct rxe_dev *rxe); + struct device *(*dma_device)(struct rxe_dev *rxe); + int (*mcast_add)(struct rxe_dev *rxe, union ib_gid *mgid); + int (*mcast_delete)(struct rxe_dev *rxe, union ib_gid 
*mgid); + int (*prepare)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb, u32 *crc); + int (*send)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb); + int (*loopback)(struct sk_buff *skb); + struct sk_buff *(*init_packet)(struct rxe_dev *rxe, struct rxe_av *av, + int paylen, struct rxe_pkt_info *pkt); + char *(*parent_name)(struct rxe_dev *rxe, unsigned int port_num); + enum rdma_link_layer (*link_layer)(struct rxe_dev *rxe, + unsigned int port_num); +}; + +struct rxe_dev { + struct ib_device ib_dev; + struct ib_device_attr attr; + int max_ucontext; + int max_inline_data; + struct kref ref_cnt; + struct mutex usdev_lock; + + struct rxe_ifc_ops *ifc_ops; + + struct net_device *ndev; + + int xmit_errors; + + struct rxe_pool uc_pool; + struct rxe_pool pd_pool; + struct rxe_pool ah_pool; + struct rxe_pool srq_pool; + struct rxe_pool qp_pool; + struct rxe_pool cq_pool; + struct rxe_pool mr_pool; + struct rxe_pool mw_pool; + struct rxe_pool mc_grp_pool; + struct rxe_pool mc_elem_pool; + + spinlock_t pending_lock; /* guard pending_mmaps */ + struct list_head pending_mmaps; + + spinlock_t mmap_offset_lock; /* guard mmap_offset */ + int mmap_offset; + + struct rxe_port port; + struct list_head list; +}; + +static inline struct rxe_dev *to_rdev(struct ib_device *dev) +{ + return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL; +} + +static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc) +{ + return uc ? container_of(uc, struct rxe_ucontext, ibuc) : NULL; +} + +static inline struct rxe_pd *to_rpd(struct ib_pd *pd) +{ + return pd ? container_of(pd, struct rxe_pd, ibpd) : NULL; +} + +static inline struct rxe_ah *to_rah(struct ib_ah *ah) +{ + return ah ? container_of(ah, struct rxe_ah, ibah) : NULL; +} + +static inline struct rxe_srq *to_rsrq(struct ib_srq *srq) +{ + return srq ? container_of(srq, struct rxe_srq, ibsrq) : NULL; +} + +static inline struct rxe_qp *to_rqp(struct ib_qp *qp) +{ + return qp ? 
container_of(qp, struct rxe_qp, ibqp) : NULL; +} + +static inline struct rxe_cq *to_rcq(struct ib_cq *cq) +{ + return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL; +} + +static inline struct rxe_mem *to_rmr(struct ib_mr *mr) +{ + return mr ? container_of(mr, struct rxe_mem, ibmr) : NULL; +} + +static inline struct rxe_mem *to_rmw(struct ib_mw *mw) +{ + return mw ? container_of(mw, struct rxe_mem, ibmw) : NULL; +} + +int rxe_register_device(struct rxe_dev *rxe); +int rxe_unregister_device(struct rxe_dev *rxe); + +void rxe_mc_cleanup(void *arg); + +#endif /* RXE_VERBS_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 1502199c8e56..7b6d40ff1acf 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -62,10 +62,8 @@ static void ipoib_get_drvinfo(struct net_device *netdev, { struct ipoib_dev_priv *priv = netdev_priv(netdev); - snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), - "%d.%d.%d", (int)(priv->ca->attrs.fw_ver >> 32), - (int)(priv->ca->attrs.fw_ver >> 16) & 0xffff, - (int)priv->ca->attrs.fw_ver & 0xffff); + ib_get_device_fw_str(priv->ca, drvinfo->fw_version, + sizeof(drvinfo->fw_version)); strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device), sizeof(drvinfo->bus_info)); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 5f58c41ef787..74bcaa064226 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1967,8 +1967,7 @@ int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) priv->hca_caps = hca->attrs.device_cap_flags; if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { - priv->dev->hw_features = NETIF_F_SG | - NETIF_F_IP_CSUM | NETIF_F_RXCSUM; + priv->dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM; if (priv->hca_caps & IB_DEVICE_UD_TSO) priv->dev->hw_features |= NETIF_F_TSO; diff --git 
a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 1e7cbbaa15bd..c55ecb2c3736 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -135,7 +135,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) .cap = { .max_send_wr = ipoib_sendq_size, .max_recv_wr = ipoib_recvq_size, - .max_send_sge = 1, + .max_send_sge = min_t(u32, priv->ca->attrs.max_sge, + MAX_SKB_FRAGS + 1), .max_recv_sge = IPOIB_UD_RX_SG }, .sq_sig_type = IB_SIGNAL_ALL_WR, @@ -205,10 +206,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; - if (dev->features & NETIF_F_SG) - init_attr.cap.max_send_sge = - min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1); - priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { printk(KERN_WARNING "%s: failed to create QP\n", ca->name); @@ -234,6 +231,9 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->rx_wr.next = NULL; priv->rx_wr.sg_list = priv->rx_sge; + if (init_attr.cap.max_send_sge > 1) + dev->features |= NETIF_F_SG; + priv->max_send_sge = init_attr.cap.max_send_sge; return 0; diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index a990c04208c9..ba6be060a476 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -137,8 +137,6 @@ isert_create_qp(struct isert_conn *isert_conn, attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1; attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX; attr.cap.max_send_sge = device->ib_device->attrs.max_sge; - isert_conn->max_sge = min(device->ib_device->attrs.max_sge, - device->ib_device->attrs.max_sge_rd); attr.cap.max_recv_sge = 1; attr.sq_sig_type = IB_SIGNAL_REQ_WR; attr.qp_type = IB_QPT_RC; diff --git 
a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index e512ba941f2f..fc791efe3a10 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -138,7 +138,6 @@ struct isert_conn { u32 responder_resources; u32 initiator_depth; bool pi_support; - u32 max_sge; struct iser_rx_desc *login_req_buf; char *login_rsp_buf; u64 login_req_dma; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 4a4155640d51..dfa23b075a88 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1601,6 +1601,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) struct ib_qp_init_attr *qp_init; struct srpt_port *sport = ch->sport; struct srpt_device *sdev = sport->sdev; + const struct ib_device_attr *attrs = &sdev->device->attrs; u32 srp_sq_size = sport->port_attrib.srp_sq_size; int ret; @@ -1638,7 +1639,7 @@ retry: */ qp_init->cap.max_send_wr = srp_sq_size / 2; qp_init->cap.max_rdma_ctxs = srp_sq_size / 2; - qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE; + qp_init->cap.max_send_sge = min(attrs->max_sge, SRPT_MAX_SG_PER_WQE); qp_init->port_num = ch->sport->port; ch->qp = ib_create_qp(sdev->pd, qp_init); @@ -2261,7 +2262,7 @@ static void srpt_queue_response(struct se_cmd *cmd) container_of(cmd, struct srpt_send_ioctx, cmd); struct srpt_rdma_ch *ch = ioctx->ch; struct srpt_device *sdev = ch->sport->sdev; - struct ib_send_wr send_wr, *first_wr = NULL, *bad_wr; + struct ib_send_wr send_wr, *first_wr = &send_wr, *bad_wr; struct ib_sge sge; enum srpt_command_state state; unsigned long flags; @@ -2302,11 +2303,8 @@ static void srpt_queue_response(struct se_cmd *cmd) struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp, - ch->sport->port, NULL, - first_wr ? 
first_wr : &send_wr); + ch->sport->port, NULL, first_wr); } - } else { - first_wr = &send_wr; } if (state != SRPT_STATE_MGMT) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 389030487da7..581878782854 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -106,7 +106,11 @@ enum { SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2, SRPT_DEF_SG_TABLESIZE = 128, - SRPT_DEF_SG_PER_WQE = 16, + /* + * An experimentally determined value that avoids that QP creation + * fails due to "swiotlb buffer is full" on systems using the swiotlb. + */ + SRPT_MAX_SG_PER_WQE = 16, MIN_SRPT_SQ_SIZE = 16, DEF_SRPT_SQ_SIZE = 4096, diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index a529a4535457..83af17ad0f1f 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -115,6 +115,10 @@ static bool sticks_to_null; module_param(sticks_to_null, bool, S_IRUGO); MODULE_PARM_DESC(sticks_to_null, "Do not map sticks at all for unknown pads"); +static bool auto_poweroff = true; +module_param(auto_poweroff, bool, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(auto_poweroff, "Power off wireless controllers on suspend"); + static const struct xpad_device { u16 idVendor; u16 idProduct; @@ -1248,6 +1252,36 @@ static void xpad_stop_input(struct usb_xpad *xpad) usb_kill_urb(xpad->irq_in); } +static void xpad360w_poweroff_controller(struct usb_xpad *xpad) +{ + unsigned long flags; + struct xpad_output_packet *packet = + &xpad->out_packets[XPAD_OUT_CMD_IDX]; + + spin_lock_irqsave(&xpad->odata_lock, flags); + + packet->data[0] = 0x00; + packet->data[1] = 0x00; + packet->data[2] = 0x08; + packet->data[3] = 0xC0; + packet->data[4] = 0x00; + packet->data[5] = 0x00; + packet->data[6] = 0x00; + packet->data[7] = 0x00; + packet->data[8] = 0x00; + packet->data[9] = 0x00; + packet->data[10] = 0x00; + packet->data[11] = 0x00; + packet->len = 12; + packet->pending = true; + + /* Reset the 
sequence so we send out poweroff now */ + xpad->last_out_packet = -1; + xpad_try_sending_next_out_packet(xpad); + + spin_unlock_irqrestore(&xpad->odata_lock, flags); +} + static int xpad360w_start_input(struct usb_xpad *xpad) { int error; @@ -1590,6 +1624,15 @@ static int xpad_suspend(struct usb_interface *intf, pm_message_t message) * or goes away. */ xpad360w_stop_input(xpad); + + /* + * The wireless adapter is going off now, so the + * gamepads are going to become disconnected. + * Unless explicitly disabled, power them down + * so they don't just sit there flashing. + */ + if (auto_poweroff && xpad->pad_present) + xpad360w_poweroff_controller(xpad); } else { mutex_lock(&input->mutex); if (input->users) diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c index b01966dc7eb3..4b0878f35471 100644 --- a/drivers/input/keyboard/cros_ec_keyb.c +++ b/drivers/input/keyboard/cros_ec_keyb.c @@ -186,7 +186,7 @@ static irqreturn_t cros_ec_keyb_irq(int irq, void *data) if (ret >= 0) cros_ec_keyb_process(ckdev, kb_state, ret); else - dev_err(ec->dev, "failed to get keyboard state: %d\n", ret); + dev_err(ckdev->dev, "failed to get keyboard state: %d\n", ret); return IRQ_HANDLED; } @@ -236,7 +236,7 @@ static void cros_ec_keyb_compute_valid_keys(struct cros_ec_keyb *ckdev) static int cros_ec_keyb_probe(struct platform_device *pdev) { struct cros_ec_device *ec = dev_get_drvdata(pdev->dev.parent); - struct device *dev = ec->dev; + struct device *dev = &pdev->dev; struct cros_ec_keyb *ckdev; struct input_dev *idev; struct device_node *np; @@ -246,23 +246,22 @@ static int cros_ec_keyb_probe(struct platform_device *pdev) if (!np) return -ENODEV; - ckdev = devm_kzalloc(&pdev->dev, sizeof(*ckdev), GFP_KERNEL); + ckdev = devm_kzalloc(dev, sizeof(*ckdev), GFP_KERNEL); if (!ckdev) return -ENOMEM; - err = matrix_keypad_parse_of_params(&pdev->dev, &ckdev->rows, - &ckdev->cols); + err = matrix_keypad_parse_of_params(dev, &ckdev->rows, &ckdev->cols); if 
(err) return err; - ckdev->valid_keys = devm_kzalloc(&pdev->dev, ckdev->cols, GFP_KERNEL); + ckdev->valid_keys = devm_kzalloc(dev, ckdev->cols, GFP_KERNEL); if (!ckdev->valid_keys) return -ENOMEM; - ckdev->old_kb_state = devm_kzalloc(&pdev->dev, ckdev->cols, GFP_KERNEL); + ckdev->old_kb_state = devm_kzalloc(dev, ckdev->cols, GFP_KERNEL); if (!ckdev->old_kb_state) return -ENOMEM; - idev = devm_input_allocate_device(&pdev->dev); + idev = devm_input_allocate_device(dev); if (!idev) return -ENOMEM; @@ -273,7 +272,7 @@ static int cros_ec_keyb_probe(struct platform_device *pdev) ckdev->ec = ec; ckdev->dev = dev; - dev_set_drvdata(&pdev->dev, ckdev); + dev_set_drvdata(dev, ckdev); idev->name = CROS_EC_DEV_NAME; idev->phys = ec->phys_name; @@ -282,7 +281,7 @@ static int cros_ec_keyb_probe(struct platform_device *pdev) idev->id.bustype = BUS_VIRTUAL; idev->id.version = 1; idev->id.product = 0; - idev->dev.parent = &pdev->dev; + idev->dev.parent = dev; idev->open = cros_ec_keyb_open; idev->close = cros_ec_keyb_close; diff --git a/drivers/input/misc/rotary_encoder.c b/drivers/input/misc/rotary_encoder.c index c7fc8d4fb080..1588aecafff7 100644 --- a/drivers/input/misc/rotary_encoder.c +++ b/drivers/input/misc/rotary_encoder.c @@ -28,6 +28,11 @@ #define DRV_NAME "rotary-encoder" +enum rotary_encoder_encoding { + ROTENC_GRAY, + ROTENC_BINARY, +}; + struct rotary_encoder { struct input_dev *input; @@ -37,6 +42,7 @@ struct rotary_encoder { u32 axis; bool relative_axis; bool rollover; + enum rotary_encoder_encoding encoding; unsigned int pos; @@ -57,8 +63,9 @@ static unsigned int rotary_encoder_get_state(struct rotary_encoder *encoder) for (i = 0; i < encoder->gpios->ndescs; ++i) { int val = gpiod_get_value_cansleep(encoder->gpios->desc[i]); + /* convert from gray encoding to normal */ - if (ret & 1) + if (encoder->encoding == ROTENC_GRAY && ret & 1) val = !val; ret = ret << 1 | val; @@ -213,6 +220,20 @@ static int rotary_encoder_probe(struct platform_device *pdev) 
encoder->rollover = device_property_read_bool(dev, "rotary-encoder,rollover"); + if (!device_property_present(dev, "rotary-encoder,encoding") || + !device_property_match_string(dev, "rotary-encoder,encoding", + "gray")) { + dev_info(dev, "gray"); + encoder->encoding = ROTENC_GRAY; + } else if (!device_property_match_string(dev, "rotary-encoder,encoding", + "binary")) { + dev_info(dev, "binary"); + encoder->encoding = ROTENC_BINARY; + } else { + dev_err(dev, "unknown encoding setting\n"); + return -EINVAL; + } + device_property_read_u32(dev, "linux,axis", &encoder->axis); encoder->relative_axis = device_property_read_bool(dev, "rotary-encoder,relative-axis"); diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index 2f589857a039..d15b33813021 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -4,7 +4,8 @@ * Copyright (c) 2013 ELAN Microelectronics Corp. * * Author: 林政維 (Duson Lin) <dusonlin@emc.com.tw> - * Version: 1.6.0 + * Author: KT Liao <kt.liao@emc.com.tw> + * Version: 1.6.2 * * Based on cyapa driver: * copyright (c) 2011-2012 Cypress Semiconductor, Inc. @@ -40,7 +41,7 @@ #include "elan_i2c.h" #define DRIVER_NAME "elan_i2c" -#define ELAN_DRIVER_VERSION "1.6.1" +#define ELAN_DRIVER_VERSION "1.6.2" #define ELAN_VENDOR_ID 0x04f3 #define ETP_MAX_PRESSURE 255 #define ETP_FWIDTH_REDUCE 90 @@ -199,9 +200,41 @@ static int elan_sleep(struct elan_tp_data *data) return error; } +static int elan_query_product(struct elan_tp_data *data) +{ + int error; + + error = data->ops->get_product_id(data->client, &data->product_id); + if (error) + return error; + + error = data->ops->get_sm_version(data->client, &data->ic_type, + &data->sm_version); + if (error) + return error; + + return 0; +} + +static int elan_check_ASUS_special_fw(struct elan_tp_data *data) +{ + if (data->ic_type != 0x0E) + return false; + + switch (data->product_id) { + case 0x05 ...
0x07: + case 0x09: + case 0x13: + return true; + default: + return false; + } +} + static int __elan_initialize(struct elan_tp_data *data) { struct i2c_client *client = data->client; + bool woken_up = false; int error; error = data->ops->initialize(client); @@ -210,6 +243,27 @@ static int __elan_initialize(struct elan_tp_data *data) return error; } + error = elan_query_product(data); + if (error) + return error; + + /* + * Some ASUS devices were shipped with firmware that requires + * touchpads to be woken up first, before attempting to switch + * them into absolute reporting mode. + */ + if (elan_check_ASUS_special_fw(data)) { + error = data->ops->sleep_control(client, false); + if (error) { + dev_err(&client->dev, + "failed to wake device up: %d\n", error); + return error; + } + + msleep(200); + woken_up = true; + } + data->mode |= ETP_ENABLE_ABS; error = data->ops->set_mode(client, data->mode); if (error) { @@ -218,11 +272,13 @@ static int __elan_initialize(struct elan_tp_data *data) return error; } - error = data->ops->sleep_control(client, false); - if (error) { - dev_err(&client->dev, - "failed to wake device up: %d\n", error); - return error; + if (!woken_up) { + error = data->ops->sleep_control(client, false); + if (error) { + dev_err(&client->dev, + "failed to wake device up: %d\n", error); + return error; + } } return 0; @@ -248,10 +304,6 @@ static int elan_query_device_info(struct elan_tp_data *data) { int error; - error = data->ops->get_product_id(data->client, &data->product_id); - if (error) - return error; - error = data->ops->get_version(data->client, false, &data->fw_version); if (error) return error; @@ -261,11 +313,6 @@ static int elan_query_device_info(struct elan_tp_data *data) if (error) return error; - error = data->ops->get_sm_version(data->client, &data->ic_type, - &data->sm_version); - if (error) - return error; - error = data->ops->get_version(data->client, true, &data->iap_version); if (error) return error; diff --git 
a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c index 615d23ec0d8e..08e252a42480 100644 --- a/drivers/input/mouse/elantech.c +++ b/drivers/input/mouse/elantech.c @@ -222,12 +222,8 @@ static int elantech_write_reg(struct psmouse *psmouse, unsigned char reg, */ static void elantech_packet_dump(struct psmouse *psmouse) { - int i; - - psmouse_printk(KERN_DEBUG, psmouse, "PS/2 packet ["); - for (i = 0; i < psmouse->pktsize; i++) - printk("%s0x%02x ", i ? ", " : " ", psmouse->packet[i]); - printk("]\n"); + psmouse_printk(KERN_DEBUG, psmouse, "PS/2 packet [%*ph]\n", + psmouse->pktsize, psmouse->packet); } /* diff --git a/drivers/input/rmi4/rmi_bus.c b/drivers/input/rmi4/rmi_bus.c index 253df96be427..a73580654c6b 100644 --- a/drivers/input/rmi4/rmi_bus.c +++ b/drivers/input/rmi4/rmi_bus.c @@ -232,10 +232,7 @@ err_put_device: void rmi_unregister_function(struct rmi_function *fn) { device_del(&fn->dev); - - if (fn->dev.of_node) - of_node_put(fn->dev.of_node); - + of_node_put(fn->dev.of_node); put_device(&fn->dev); } diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index 454195709a82..b4d34086e73f 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -1277,6 +1277,7 @@ static int __init i8042_create_kbd_port(void) serio->start = i8042_start; serio->stop = i8042_stop; serio->close = i8042_port_close; + serio->ps2_cmd_mutex = &i8042_mutex; serio->port_data = port; serio->dev.parent = &i8042_platform_device->dev; strlcpy(serio->name, "i8042 KBD port", sizeof(serio->name)); @@ -1373,21 +1374,6 @@ static void i8042_unregister_ports(void) } } -/* - * Checks whether port belongs to i8042 controller. 
- */ -bool i8042_check_port_owner(const struct serio *port) -{ - int i; - - for (i = 0; i < I8042_NUM_PORTS; i++) - if (i8042_ports[i].serio == port) - return true; - - return false; -} -EXPORT_SYMBOL(i8042_check_port_owner); - static void i8042_free_irqs(void) { if (i8042_aux_irq_registered) diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c index 316f2c897101..83e9c663aa67 100644 --- a/drivers/input/serio/libps2.c +++ b/drivers/input/serio/libps2.c @@ -56,19 +56,17 @@ EXPORT_SYMBOL(ps2_sendbyte); void ps2_begin_command(struct ps2dev *ps2dev) { - mutex_lock(&ps2dev->cmd_mutex); + struct mutex *m = ps2dev->serio->ps2_cmd_mutex ?: &ps2dev->cmd_mutex; - if (i8042_check_port_owner(ps2dev->serio)) - i8042_lock_chip(); + mutex_lock(m); } EXPORT_SYMBOL(ps2_begin_command); void ps2_end_command(struct ps2dev *ps2dev) { - if (i8042_check_port_owner(ps2dev->serio)) - i8042_unlock_chip(); + struct mutex *m = ps2dev->serio->ps2_cmd_mutex ?: &ps2dev->cmd_mutex; - mutex_unlock(&ps2dev->cmd_mutex); + mutex_unlock(m); } EXPORT_SYMBOL(ps2_end_command); diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index ee02dc7422bd..2fb1f430a431 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -1059,6 +1059,31 @@ config TOUCHSCREEN_RM_TS To compile this driver as a module, choose M here: the module will be called raydium_i2c_ts. +config TOUCHSCREEN_SILEAD + tristate "Silead I2C touchscreen" + depends on I2C + help + Say Y here if you have the Silead touchscreen connected to + your system. + + If unsure, say N. + + To compile this driver as a module, choose M here: the + module will be called silead. + +config TOUCHSCREEN_SIS_I2C + tristate "SiS 9200 family I2C touchscreen" + depends on I2C + select CRC_ITU_T + depends on GPIOLIB || COMPILE_TEST + help + This enables support for SiS 9200 family over I2C based touchscreens. + + If unsure, say N. 
+ + To compile this driver as a module, choose M here: the + module will be called sis_i2c. + config TOUCHSCREEN_ST1232 tristate "Sitronix ST1232 touchscreen controllers" depends on I2C diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile index 3315882905f7..b4373d6be402 100644 --- a/drivers/input/touchscreen/Makefile +++ b/drivers/input/touchscreen/Makefile @@ -64,6 +64,8 @@ obj-$(CONFIG_TOUCHSCREEN_PENMOUNT) += penmount.o obj-$(CONFIG_TOUCHSCREEN_PIXCIR) += pixcir_i2c_ts.o obj-$(CONFIG_TOUCHSCREEN_RM_TS) += raydium_i2c_ts.o obj-$(CONFIG_TOUCHSCREEN_S3C2410) += s3c2410_ts.o +obj-$(CONFIG_TOUCHSCREEN_SILEAD) += silead.o +obj-$(CONFIG_TOUCHSCREEN_SIS_I2C) += sis_i2c.o obj-$(CONFIG_TOUCHSCREEN_ST1232) += st1232.o obj-$(CONFIG_TOUCHSCREEN_STMPE) += stmpe-ts.o obj-$(CONFIG_TOUCHSCREEN_SUN4I) += sun4i-ts.o diff --git a/drivers/input/touchscreen/ili210x.c b/drivers/input/touchscreen/ili210x.c index ddf694b9fffc..fe4848bd1f4c 100644 --- a/drivers/input/touchscreen/ili210x.c +++ b/drivers/input/touchscreen/ili210x.c @@ -169,7 +169,7 @@ static ssize_t ili210x_calibrate(struct device *dev, return count; } -static DEVICE_ATTR(calibrate, 0644, NULL, ili210x_calibrate); +static DEVICE_ATTR(calibrate, S_IWUSR, NULL, ili210x_calibrate); static struct attribute *ili210x_attributes[] = { &dev_attr_calibrate.attr, diff --git a/drivers/input/touchscreen/silead.c b/drivers/input/touchscreen/silead.c new file mode 100644 index 000000000000..7379fe153cf9 --- /dev/null +++ b/drivers/input/touchscreen/silead.c @@ -0,0 +1,565 @@ +/* ------------------------------------------------------------------------- + * Copyright (C) 2014-2015, Intel Corporation + * + * Derived from: + * gslX68X.c + * Copyright (C) 2010-2015, Shanghai Sileadinc Co.Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + 
* (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * ------------------------------------------------------------------------- + */ + +#include <linux/i2c.h> +#include <linux/module.h> +#include <linux/acpi.h> +#include <linux/interrupt.h> +#include <linux/gpio/consumer.h> +#include <linux/delay.h> +#include <linux/firmware.h> +#include <linux/input.h> +#include <linux/input/mt.h> +#include <linux/input/touchscreen.h> +#include <linux/pm.h> +#include <linux/irq.h> + +#include <asm/unaligned.h> + +#define SILEAD_TS_NAME "silead_ts" + +#define SILEAD_REG_RESET 0xE0 +#define SILEAD_REG_DATA 0x80 +#define SILEAD_REG_TOUCH_NR 0x80 +#define SILEAD_REG_POWER 0xBC +#define SILEAD_REG_CLOCK 0xE4 +#define SILEAD_REG_STATUS 0xB0 +#define SILEAD_REG_ID 0xFC +#define SILEAD_REG_MEM_CHECK 0xB0 + +#define SILEAD_STATUS_OK 0x5A5A5A5A +#define SILEAD_TS_DATA_LEN 44 +#define SILEAD_CLOCK 0x04 + +#define SILEAD_CMD_RESET 0x88 +#define SILEAD_CMD_START 0x00 + +#define SILEAD_POINT_DATA_LEN 0x04 +#define SILEAD_POINT_Y_OFF 0x00 +#define SILEAD_POINT_Y_MSB_OFF 0x01 +#define SILEAD_POINT_X_OFF 0x02 +#define SILEAD_POINT_X_MSB_OFF 0x03 +#define SILEAD_TOUCH_ID_MASK 0xF0 + +#define SILEAD_CMD_SLEEP_MIN 10000 +#define SILEAD_CMD_SLEEP_MAX 20000 +#define SILEAD_POWER_SLEEP 20 +#define SILEAD_STARTUP_SLEEP 30 + +#define SILEAD_MAX_FINGERS 10 + +enum silead_ts_power { + SILEAD_POWER_ON = 1, + SILEAD_POWER_OFF = 0 +}; + +struct silead_ts_data { + struct i2c_client *client; + struct gpio_desc *gpio_power; + struct input_dev *input; + char fw_name[64]; + struct touchscreen_properties prop; + u32 max_fingers; + u32 chip_id; + struct input_mt_pos pos[SILEAD_MAX_FINGERS]; + int slots[SILEAD_MAX_FINGERS]; + int id[SILEAD_MAX_FINGERS]; +}; + +struct silead_fw_data 
{ + u32 offset; + u32 val; +}; + +static int silead_ts_request_input_dev(struct silead_ts_data *data) +{ + struct device *dev = &data->client->dev; + int error; + + data->input = devm_input_allocate_device(dev); + if (!data->input) { + dev_err(dev, + "Failed to allocate input device\n"); + return -ENOMEM; + } + + input_set_abs_params(data->input, ABS_MT_POSITION_X, 0, 4095, 0, 0); + input_set_abs_params(data->input, ABS_MT_POSITION_Y, 0, 4095, 0, 0); + touchscreen_parse_properties(data->input, true, &data->prop); + + input_mt_init_slots(data->input, data->max_fingers, + INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED | + INPUT_MT_TRACK); + + data->input->name = SILEAD_TS_NAME; + data->input->phys = "input/ts"; + data->input->id.bustype = BUS_I2C; + + error = input_register_device(data->input); + if (error) { + dev_err(dev, "Failed to register input device: %d\n", error); + return error; + } + + return 0; +} + +static void silead_ts_set_power(struct i2c_client *client, + enum silead_ts_power state) +{ + struct silead_ts_data *data = i2c_get_clientdata(client); + + if (data->gpio_power) { + gpiod_set_value_cansleep(data->gpio_power, state); + msleep(SILEAD_POWER_SLEEP); + } +} + +static void silead_ts_read_data(struct i2c_client *client) +{ + struct silead_ts_data *data = i2c_get_clientdata(client); + struct input_dev *input = data->input; + struct device *dev = &client->dev; + u8 *bufp, buf[SILEAD_TS_DATA_LEN]; + int touch_nr, error, i; + + error = i2c_smbus_read_i2c_block_data(client, SILEAD_REG_DATA, + SILEAD_TS_DATA_LEN, buf); + if (error < 0) { + dev_err(dev, "Data read error %d\n", error); + return; + } + + touch_nr = buf[0]; + if (touch_nr > data->max_fingers) { + dev_warn(dev, "More touches reported then supported %d > %d\n", + touch_nr, data->max_fingers); + touch_nr = data->max_fingers; + } + + bufp = buf + SILEAD_POINT_DATA_LEN; + for (i = 0; i < touch_nr; i++, bufp += SILEAD_POINT_DATA_LEN) { + /* Bits 4-7 are the touch id */ + data->id[i] = 
(bufp[SILEAD_POINT_X_MSB_OFF] & + SILEAD_TOUCH_ID_MASK) >> 4; + touchscreen_set_mt_pos(&data->pos[i], &data->prop, + get_unaligned_le16(&bufp[SILEAD_POINT_X_OFF]) & 0xfff, + get_unaligned_le16(&bufp[SILEAD_POINT_Y_OFF]) & 0xfff); + } + + input_mt_assign_slots(input, data->slots, data->pos, touch_nr, 0); + + for (i = 0; i < touch_nr; i++) { + input_mt_slot(input, data->slots[i]); + input_mt_report_slot_state(input, MT_TOOL_FINGER, true); + input_report_abs(input, ABS_MT_POSITION_X, data->pos[i].x); + input_report_abs(input, ABS_MT_POSITION_Y, data->pos[i].y); + + dev_dbg(dev, "x=%d y=%d hw_id=%d sw_id=%d\n", data->pos[i].x, + data->pos[i].y, data->id[i], data->slots[i]); + } + + input_mt_sync_frame(input); + input_sync(input); +} + +static int silead_ts_init(struct i2c_client *client) +{ + struct silead_ts_data *data = i2c_get_clientdata(client); + int error; + + error = i2c_smbus_write_byte_data(client, SILEAD_REG_RESET, + SILEAD_CMD_RESET); + if (error) + goto i2c_write_err; + usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX); + + error = i2c_smbus_write_byte_data(client, SILEAD_REG_TOUCH_NR, + data->max_fingers); + if (error) + goto i2c_write_err; + usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX); + + error = i2c_smbus_write_byte_data(client, SILEAD_REG_CLOCK, + SILEAD_CLOCK); + if (error) + goto i2c_write_err; + usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX); + + error = i2c_smbus_write_byte_data(client, SILEAD_REG_RESET, + SILEAD_CMD_START); + if (error) + goto i2c_write_err; + usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX); + + return 0; + +i2c_write_err: + dev_err(&client->dev, "Registers clear error %d\n", error); + return error; +} + +static int silead_ts_reset(struct i2c_client *client) +{ + int error; + + error = i2c_smbus_write_byte_data(client, SILEAD_REG_RESET, + SILEAD_CMD_RESET); + if (error) + goto i2c_write_err; + usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX); + + error = 
i2c_smbus_write_byte_data(client, SILEAD_REG_CLOCK, + SILEAD_CLOCK); + if (error) + goto i2c_write_err; + usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX); + + error = i2c_smbus_write_byte_data(client, SILEAD_REG_POWER, + SILEAD_CMD_START); + if (error) + goto i2c_write_err; + usleep_range(SILEAD_CMD_SLEEP_MIN, SILEAD_CMD_SLEEP_MAX); + + return 0; + +i2c_write_err: + dev_err(&client->dev, "Chip reset error %d\n", error); + return error; +} + +static int silead_ts_startup(struct i2c_client *client) +{ + int error; + + error = i2c_smbus_write_byte_data(client, SILEAD_REG_RESET, 0x00); + if (error) { + dev_err(&client->dev, "Startup error %d\n", error); + return error; + } + + msleep(SILEAD_STARTUP_SLEEP); + + return 0; +} + +static int silead_ts_load_fw(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct silead_ts_data *data = i2c_get_clientdata(client); + unsigned int fw_size, i; + const struct firmware *fw; + struct silead_fw_data *fw_data; + int error; + + dev_dbg(dev, "Firmware file name: %s", data->fw_name); + + error = request_firmware(&fw, data->fw_name, dev); + if (error) { + dev_err(dev, "Firmware request error %d\n", error); + return error; + } + + fw_size = fw->size / sizeof(*fw_data); + fw_data = (struct silead_fw_data *)fw->data; + + for (i = 0; i < fw_size; i++) { + error = i2c_smbus_write_i2c_block_data(client, + fw_data[i].offset, + 4, + (u8 *)&fw_data[i].val); + if (error) { + dev_err(dev, "Firmware load error %d\n", error); + break; + } + } + + release_firmware(fw); + return error ?: 0; +} + +static u32 silead_ts_get_status(struct i2c_client *client) +{ + int error; + __le32 status; + + error = i2c_smbus_read_i2c_block_data(client, SILEAD_REG_STATUS, + sizeof(status), (u8 *)&status); + if (error < 0) { + dev_err(&client->dev, "Status read error %d\n", error); + return error; + } + + return le32_to_cpu(status); +} + +static int silead_ts_get_id(struct i2c_client *client) +{ + struct silead_ts_data *data = 
i2c_get_clientdata(client); + __le32 chip_id; + int error; + + error = i2c_smbus_read_i2c_block_data(client, SILEAD_REG_ID, + sizeof(chip_id), (u8 *)&chip_id); + if (error < 0) { + dev_err(&client->dev, "Chip ID read error %d\n", error); + return error; + } + + data->chip_id = le32_to_cpu(chip_id); + dev_info(&client->dev, "Silead chip ID: 0x%8X", data->chip_id); + + return 0; +} + +static int silead_ts_setup(struct i2c_client *client) +{ + int error; + u32 status; + + silead_ts_set_power(client, SILEAD_POWER_OFF); + silead_ts_set_power(client, SILEAD_POWER_ON); + + error = silead_ts_get_id(client); + if (error) + return error; + + error = silead_ts_init(client); + if (error) + return error; + + error = silead_ts_reset(client); + if (error) + return error; + + error = silead_ts_load_fw(client); + if (error) + return error; + + error = silead_ts_startup(client); + if (error) + return error; + + status = silead_ts_get_status(client); + if (status != SILEAD_STATUS_OK) { + dev_err(&client->dev, + "Initialization error, status: 0x%X\n", status); + return -ENODEV; + } + + return 0; +} + +static irqreturn_t silead_ts_threaded_irq_handler(int irq, void *id) +{ + struct silead_ts_data *data = id; + struct i2c_client *client = data->client; + + silead_ts_read_data(client); + + return IRQ_HANDLED; +} + +static void silead_ts_read_props(struct i2c_client *client) +{ + struct silead_ts_data *data = i2c_get_clientdata(client); + struct device *dev = &client->dev; + const char *str; + int error; + + error = device_property_read_u32(dev, "silead,max-fingers", + &data->max_fingers); + if (error) { + dev_dbg(dev, "Max fingers read error %d\n", error); + data->max_fingers = 5; /* Most devices handle up-to 5 fingers */ + } + + error = device_property_read_string(dev, "touchscreen-fw-name", &str); + if (!error) + snprintf(data->fw_name, sizeof(data->fw_name), "%s", str); + else + dev_dbg(dev, "Firmware file name read error. 
Using default."); +} + +#ifdef CONFIG_ACPI +static int silead_ts_set_default_fw_name(struct silead_ts_data *data, + const struct i2c_device_id *id) +{ + const struct acpi_device_id *acpi_id; + struct device *dev = &data->client->dev; + int i; + + if (ACPI_HANDLE(dev)) { + acpi_id = acpi_match_device(dev->driver->acpi_match_table, dev); + if (!acpi_id) + return -ENODEV; + + snprintf(data->fw_name, sizeof(data->fw_name), "%s.fw", + acpi_id->id); + + for (i = 0; i < strlen(data->fw_name); i++) + data->fw_name[i] = tolower(data->fw_name[i]); + } else { + snprintf(data->fw_name, sizeof(data->fw_name), "%s.fw", + id->name); + } + + return 0; +} +#else +static int silead_ts_set_default_fw_name(struct silead_ts_data *data, + const struct i2c_device_id *id) +{ + snprintf(data->fw_name, sizeof(data->fw_name), "%s.fw", id->name); + return 0; +} +#endif + +static int silead_ts_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct silead_ts_data *data; + struct device *dev = &client->dev; + int error; + + if (!i2c_check_functionality(client->adapter, + I2C_FUNC_I2C | + I2C_FUNC_SMBUS_READ_I2C_BLOCK | + I2C_FUNC_SMBUS_WRITE_I2C_BLOCK)) { + dev_err(dev, "I2C functionality check failed\n"); + return -ENXIO; + } + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + i2c_set_clientdata(client, data); + data->client = client; + + error = silead_ts_set_default_fw_name(data, id); + if (error) + return error; + + silead_ts_read_props(client); + + /* We must have the IRQ provided by DT or ACPI subsytem */ + if (client->irq <= 0) + return -ENODEV; + + /* Power GPIO pin */ + data->gpio_power = gpiod_get_optional(dev, "power", GPIOD_OUT_LOW); + if (IS_ERR(data->gpio_power)) { + if (PTR_ERR(data->gpio_power) != -EPROBE_DEFER) + dev_err(dev, "Shutdown GPIO request failed\n"); + return PTR_ERR(data->gpio_power); + } + + error = silead_ts_setup(client); + if (error) + return error; + + error = silead_ts_request_input_dev(data); + if 
(error) + return error; + + error = devm_request_threaded_irq(dev, client->irq, + NULL, silead_ts_threaded_irq_handler, + IRQF_ONESHOT, client->name, data); + if (error) { + if (error != -EPROBE_DEFER) + dev_err(dev, "IRQ request failed %d\n", error); + return error; + } + + return 0; +} + +static int __maybe_unused silead_ts_suspend(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + + silead_ts_set_power(client, SILEAD_POWER_OFF); + return 0; +} + +static int __maybe_unused silead_ts_resume(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + int error, status; + + silead_ts_set_power(client, SILEAD_POWER_ON); + + error = silead_ts_reset(client); + if (error) + return error; + + error = silead_ts_startup(client); + if (error) + return error; + + status = silead_ts_get_status(client); + if (status != SILEAD_STATUS_OK) { + dev_err(dev, "Resume error, status: 0x%02x\n", status); + return -ENODEV; + } + + return 0; +} + +static SIMPLE_DEV_PM_OPS(silead_ts_pm, silead_ts_suspend, silead_ts_resume); + +static const struct i2c_device_id silead_ts_id[] = { + { "gsl1680", 0 }, + { "gsl1688", 0 }, + { "gsl3670", 0 }, + { "gsl3675", 0 }, + { "gsl3692", 0 }, + { "mssl1680", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, silead_ts_id); + +#ifdef CONFIG_ACPI +static const struct acpi_device_id silead_ts_acpi_match[] = { + { "GSL1680", 0 }, + { "GSL1688", 0 }, + { "GSL3670", 0 }, + { "GSL3675", 0 }, + { "GSL3692", 0 }, + { "MSSL1680", 0 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, silead_ts_acpi_match); +#endif + +static struct i2c_driver silead_ts_driver = { + .probe = silead_ts_probe, + .id_table = silead_ts_id, + .driver = { + .name = SILEAD_TS_NAME, + .acpi_match_table = ACPI_PTR(silead_ts_acpi_match), + .pm = &silead_ts_pm, + }, +}; +module_i2c_driver(silead_ts_driver); + +MODULE_AUTHOR("Robert Dolca <robert.dolca@intel.com>"); +MODULE_DESCRIPTION("Silead I2C touchscreen driver"); +MODULE_LICENSE("GPL"); diff --git 
a/drivers/input/touchscreen/sis_i2c.c b/drivers/input/touchscreen/sis_i2c.c new file mode 100644 index 000000000000..8d93f8c9a403 --- /dev/null +++ b/drivers/input/touchscreen/sis_i2c.c @@ -0,0 +1,413 @@ +/* + * Touch Screen driver for SiS 9200 family I2C Touch panels + * + * Copyright (C) 2015 SiS, Inc. + * Copyright (C) 2016 Nextfour Group + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/crc-itu-t.h> +#include <linux/delay.h> +#include <linux/i2c.h> +#include <linux/input.h> +#include <linux/input/mt.h> +#include <linux/interrupt.h> +#include <linux/gpio/consumer.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/unaligned.h> + +#define SIS_I2C_NAME "sis_i2c_ts" + +/* + * The I2C packet format: + * le16 byte count + * u8 Report ID + * <contact data - variable length> + * u8 Number of contacts + * le16 Scan Time (optional) + * le16 CRC + * + * One touch point information consists of 6+ bytes, the order is: + * u8 contact state + * u8 finger id + * le16 x axis + * le16 y axis + * u8 contact width (optional) + * u8 contact height (optional) + * u8 pressure (optional) + * + * Maximum amount of data transmitted in one shot is 64 bytes, if controller + * needs to report more contacts than fit in one packet it will send true + * number of contacts in first packet and 0 as number of contacts in second + * packet. 
+ */ + +#define SIS_MAX_PACKET_SIZE 64 + +#define SIS_PKT_LEN_OFFSET 0 +#define SIS_PKT_REPORT_OFFSET 2 /* Report ID/type */ +#define SIS_PKT_CONTACT_OFFSET 3 /* First contact */ + +#define SIS_SCAN_TIME_LEN 2 + +/* Supported report types */ +#define SIS_ALL_IN_ONE_PACKAGE 0x10 +#define SIS_PKT_IS_TOUCH(x) (((x) & 0x0f) == 0x01) +#define SIS_PKT_IS_HIDI2C(x) (((x) & 0x0f) == 0x06) + +/* Contact properties within report */ +#define SIS_PKT_HAS_AREA(x) ((x) & BIT(4)) +#define SIS_PKT_HAS_PRESSURE(x) ((x) & BIT(5)) +#define SIS_PKT_HAS_SCANTIME(x) ((x) & BIT(6)) + +/* Contact size */ +#define SIS_BASE_LEN_PER_CONTACT 6 +#define SIS_AREA_LEN_PER_CONTACT 2 +#define SIS_PRESSURE_LEN_PER_CONTACT 1 + +/* Offsets within contact data */ +#define SIS_CONTACT_STATUS_OFFSET 0 +#define SIS_CONTACT_ID_OFFSET 1 /* Contact ID */ +#define SIS_CONTACT_X_OFFSET 2 +#define SIS_CONTACT_Y_OFFSET 4 +#define SIS_CONTACT_WIDTH_OFFSET 6 +#define SIS_CONTACT_HEIGHT_OFFSET 7 +#define SIS_CONTACT_PRESSURE_OFFSET(id) (SIS_PKT_HAS_AREA(id) ? 
8 : 6) + +/* Individual contact state */ +#define SIS_STATUS_UP 0x0 +#define SIS_STATUS_DOWN 0x3 + +/* Touchscreen parameters */ +#define SIS_MAX_FINGERS 10 +#define SIS_MAX_X 4095 +#define SIS_MAX_Y 4095 +#define SIS_MAX_PRESSURE 255 + +/* Resolution diagonal */ +#define SIS_AREA_LENGTH_LONGER 5792 +/*((SIS_MAX_X^2) + (SIS_MAX_Y^2))^0.5*/ +#define SIS_AREA_LENGTH_SHORT 5792 +#define SIS_AREA_UNIT (5792 / 32) + +struct sis_ts_data { + struct i2c_client *client; + struct input_dev *input; + + struct gpio_desc *attn_gpio; + struct gpio_desc *reset_gpio; + + u8 packet[SIS_MAX_PACKET_SIZE]; +}; + +static int sis_read_packet(struct i2c_client *client, u8 *buf, + unsigned int *num_contacts, + unsigned int *contact_size) +{ + int count_idx; + int ret; + u16 len; + u16 crc, pkg_crc; + u8 report_id; + + ret = i2c_master_recv(client, buf, SIS_MAX_PACKET_SIZE); + if (ret <= 0) + return -EIO; + + len = get_unaligned_le16(&buf[SIS_PKT_LEN_OFFSET]); + if (len > SIS_MAX_PACKET_SIZE) { + dev_err(&client->dev, + "%s: invalid packet length (%d vs %d)\n", + __func__, len, SIS_MAX_PACKET_SIZE); + return -E2BIG; + } + + if (len < 10) + return -EINVAL; + + report_id = buf[SIS_PKT_REPORT_OFFSET]; + count_idx = len - 1; + *contact_size = SIS_BASE_LEN_PER_CONTACT; + + if (report_id != SIS_ALL_IN_ONE_PACKAGE) { + if (SIS_PKT_IS_TOUCH(report_id)) { + /* + * Calculate CRC ignoring packet length + * in the beginning and CRC transmitted + * at the end of the packet. 
+ */ + crc = crc_itu_t(0, buf + 2, len - 2 - 2); + pkg_crc = get_unaligned_le16(&buf[len - 2]); + + if (crc != pkg_crc) { + dev_err(&client->dev, + "%s: CRC Error (%d vs %d)\n", + __func__, crc, pkg_crc); + return -EINVAL; + } + + count_idx -= 2; + + } else if (!SIS_PKT_IS_HIDI2C(report_id)) { + dev_err(&client->dev, + "%s: invalid packet ID %#02x\n", + __func__, report_id); + return -EINVAL; + } + + if (SIS_PKT_HAS_SCANTIME(report_id)) + count_idx -= SIS_SCAN_TIME_LEN; + + if (SIS_PKT_HAS_AREA(report_id)) + *contact_size += SIS_AREA_LEN_PER_CONTACT; + if (SIS_PKT_HAS_PRESSURE(report_id)) + *contact_size += SIS_PRESSURE_LEN_PER_CONTACT; + } + + *num_contacts = buf[count_idx]; + return 0; +} + +static int sis_ts_report_contact(struct sis_ts_data *ts, const u8 *data, u8 id) +{ + struct input_dev *input = ts->input; + int slot; + u8 status = data[SIS_CONTACT_STATUS_OFFSET]; + u8 pressure; + u8 height, width; + u16 x, y; + + if (status != SIS_STATUS_DOWN && status != SIS_STATUS_UP) { + dev_err(&ts->client->dev, "Unexpected touch status: %#02x\n", + data[SIS_CONTACT_STATUS_OFFSET]); + return -EINVAL; + } + + slot = input_mt_get_slot_by_key(input, data[SIS_CONTACT_ID_OFFSET]); + if (slot < 0) + return -ENOENT; + + input_mt_slot(input, slot); + input_mt_report_slot_state(input, MT_TOOL_FINGER, + status == SIS_STATUS_DOWN); + + if (status == SIS_STATUS_DOWN) { + pressure = height = width = 1; + if (id != SIS_ALL_IN_ONE_PACKAGE) { + if (SIS_PKT_HAS_AREA(id)) { + width = data[SIS_CONTACT_WIDTH_OFFSET]; + height = data[SIS_CONTACT_HEIGHT_OFFSET]; + } + + if (SIS_PKT_HAS_PRESSURE(id)) + pressure = + data[SIS_CONTACT_PRESSURE_OFFSET(id)]; + } + + x = get_unaligned_le16(&data[SIS_CONTACT_X_OFFSET]); + y = get_unaligned_le16(&data[SIS_CONTACT_Y_OFFSET]); + + input_report_abs(input, ABS_MT_TOUCH_MAJOR, + width * SIS_AREA_UNIT); + input_report_abs(input, ABS_MT_TOUCH_MINOR, + height * SIS_AREA_UNIT); + input_report_abs(input, ABS_MT_PRESSURE, pressure); + input_report_abs(input, 
ABS_MT_POSITION_X, x); + input_report_abs(input, ABS_MT_POSITION_Y, y); + } + + return 0; +} + +static void sis_ts_handle_packet(struct sis_ts_data *ts) +{ + const u8 *contact; + unsigned int num_to_report = 0; + unsigned int num_contacts; + unsigned int num_reported; + unsigned int contact_size; + int error; + u8 report_id; + + do { + error = sis_read_packet(ts->client, ts->packet, + &num_contacts, &contact_size); + if (error) + break; + + if (num_to_report == 0) { + num_to_report = num_contacts; + } else if (num_contacts != 0) { + dev_err(&ts->client->dev, + "%s: nonzero (%d) point count in tail packet\n", + __func__, num_contacts); + break; + } + + report_id = ts->packet[SIS_PKT_REPORT_OFFSET]; + contact = &ts->packet[SIS_PKT_CONTACT_OFFSET]; + num_reported = 0; + + while (num_to_report > 0) { + /* + * The contact count comes from the device; never + * read a contact record that does not fit entirely + * inside the receive buffer. Give up on the whole + * frame instead of fetching further packets. + */ + if (contact + contact_size > + ts->packet + sizeof(ts->packet)) { + dev_err(&ts->client->dev, + "%s: contact count exceeds packet size\n", + __func__); + num_to_report = 0; + break; + } + + error = sis_ts_report_contact(ts, contact, report_id); + if (error) + break; + + contact += contact_size; + num_to_report--; + num_reported++; + + if (report_id != SIS_ALL_IN_ONE_PACKAGE && + num_reported >= 5) { + /* + * The remainder of contacts is sent + * in the 2nd packet. 
+ */ + break; + } + } + } while (num_to_report > 0); + + input_mt_sync_frame(ts->input); + input_sync(ts->input); +} + +static irqreturn_t sis_ts_irq_handler(int irq, void *dev_id) +{ + struct sis_ts_data *ts = dev_id; + + do { + sis_ts_handle_packet(ts); + } while (ts->attn_gpio && gpiod_get_value_cansleep(ts->attn_gpio)); + + return IRQ_HANDLED; +} + +static void sis_ts_reset(struct sis_ts_data *ts) +{ + if (ts->reset_gpio) { + /* Get out of reset */ + usleep_range(1000, 2000); + gpiod_set_value(ts->reset_gpio, 1); + usleep_range(1000, 2000); + gpiod_set_value(ts->reset_gpio, 0); + msleep(100); + } +} + +static int sis_ts_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct sis_ts_data *ts; + struct input_dev *input; + int error; + + ts = devm_kzalloc(&client->dev, sizeof(*ts), GFP_KERNEL); + if (!ts) + return -ENOMEM; + + ts->client = client; + i2c_set_clientdata(client, ts); + + ts->attn_gpio = devm_gpiod_get_optional(&client->dev, + "attn", GPIOD_IN); + if (IS_ERR(ts->attn_gpio)) { + error = PTR_ERR(ts->attn_gpio); + if (error != -EPROBE_DEFER) + dev_err(&client->dev, + "Failed to get attention GPIO: %d\n", error); + return error; + } + + ts->reset_gpio = devm_gpiod_get_optional(&client->dev, + "reset", GPIOD_OUT_LOW); + if (IS_ERR(ts->reset_gpio)) { + error = PTR_ERR(ts->reset_gpio); + if (error != -EPROBE_DEFER) + dev_err(&client->dev, + "Failed to get reset GPIO: %d\n", error); + return error; + } + + sis_ts_reset(ts); + + ts->input = input = devm_input_allocate_device(&client->dev); + if (!input) { + dev_err(&client->dev, "Failed to allocate input device\n"); + return -ENOMEM; + } + + input->name = "SiS Touchscreen"; + input->id.bustype = BUS_I2C; + + input_set_abs_params(input, ABS_MT_POSITION_X, 0, SIS_MAX_X, 0, 0); + input_set_abs_params(input, ABS_MT_POSITION_Y, 0, SIS_MAX_Y, 0, 0); + input_set_abs_params(input, ABS_MT_PRESSURE, 0, SIS_MAX_PRESSURE, 0, 0); + input_set_abs_params(input, ABS_MT_TOUCH_MAJOR, + 0, 
SIS_AREA_LENGTH_LONGER, 0, 0); + input_set_abs_params(input, ABS_MT_TOUCH_MINOR, + 0, SIS_AREA_LENGTH_SHORT, 0, 0); + + error = input_mt_init_slots(input, SIS_MAX_FINGERS, INPUT_MT_DIRECT); + if (error) { + dev_err(&client->dev, + "Failed to initialize MT slots: %d\n", error); + return error; + } + + error = devm_request_threaded_irq(&client->dev, client->irq, + NULL, sis_ts_irq_handler, + IRQF_ONESHOT, + client->name, ts); + if (error) { + dev_err(&client->dev, "Failed to request IRQ: %d\n", error); + return error; + } + + error = input_register_device(ts->input); + if (error) { + dev_err(&client->dev, + "Failed to register input device: %d\n", error); + return error; + } + + return 0; +} + +#ifdef CONFIG_OF +static const struct of_device_id sis_ts_dt_ids[] = { + { .compatible = "sis,9200-ts" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, sis_ts_dt_ids); +#endif + +static const struct i2c_device_id sis_ts_id[] = { + { SIS_I2C_NAME, 0 }, + { "9200-ts", 0 }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(i2c, sis_ts_id); + +static struct i2c_driver sis_ts_driver = { + .driver = { + .name = SIS_I2C_NAME, + .of_match_table = of_match_ptr(sis_ts_dt_ids), + }, + .probe = sis_ts_probe, + .id_table = sis_ts_id, +}; +module_i2c_driver(sis_ts_driver); + +MODULE_DESCRIPTION("SiS 9200 Family Touchscreen Driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Mika Penttilä <mika.penttila@nextfour.com>"); diff --git a/drivers/mtd/ubi/attach.c b/drivers/mtd/ubi/attach.c index c1aaf0336cf2..903becd31410 100644 --- a/drivers/mtd/ubi/attach.c +++ b/drivers/mtd/ubi/attach.c @@ -175,6 +175,40 @@ static int add_corrupted(struct ubi_attach_info *ai, int pnum, int ec) } /** + * add_fastmap - add a Fastmap related physical eraseblock. 
+ * @ai: attaching information + * @pnum: physical eraseblock number the VID header came from + * @vid_hdr: the volume identifier header + * @ec: erase counter of the physical eraseblock + * + * This function allocates a 'struct ubi_ainf_peb' object for a Fastmap + * physical eraseblock @pnum and adds it to the 'fastmap' list. + * Such blocks can be Fastmap super and data blocks from both the most + * recent Fastmap we're attaching from or from old Fastmaps which will + * be erased. + */ +static int add_fastmap(struct ubi_attach_info *ai, int pnum, + struct ubi_vid_hdr *vid_hdr, int ec) +{ + struct ubi_ainf_peb *aeb; + + aeb = kmem_cache_alloc(ai->aeb_slab_cache, GFP_KERNEL); + if (!aeb) + return -ENOMEM; + + /* Take everything from the header we were handed, not a global one */ + aeb->pnum = pnum; + aeb->vol_id = be32_to_cpu(vid_hdr->vol_id); + /* lnum is consumed when old Fastmap PEBs are scheduled for erasure */ + aeb->lnum = be32_to_cpu(vid_hdr->lnum); + aeb->sqnum = be64_to_cpu(vid_hdr->sqnum); + aeb->ec = ec; + list_add(&aeb->u.list, &ai->fastmap); + + dbg_bld("add to fastmap list: PEB %d, vol_id %d, sqnum: %llu", pnum, + aeb->vol_id, aeb->sqnum); + + return 0; +} + +/** * validate_vid_hdr - check volume identifier header. * @ubi: UBI device description object * @vid_hdr: the volume identifier header to check @@ -803,13 +837,26 @@ out_unlock: return err; } +static bool vol_ignored(int vol_id) +{ + switch (vol_id) { + case UBI_LAYOUT_VOLUME_ID: + return true; + } + +#ifdef CONFIG_MTD_UBI_FASTMAP + return ubi_is_fm_vol(vol_id); +#else + return false; +#endif +} + /** * scan_peb - scan and process UBI headers of a PEB. 
* @ubi: UBI device description object * @ai: attaching information * @pnum: the physical eraseblock number - * @vid: The volume ID of the found volume will be stored in this pointer - * @sqnum: The sqnum of the found volume will be stored in this pointer + * @fast: true if we're scanning for a Fastmap * * This function reads UBI headers of PEB @pnum, checks them, and adds * information about this PEB to the corresponding list or RB-tree in the @@ -817,9 +864,9 @@ out_unlock: * successfully handled and a negative error code in case of failure. */ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai, - int pnum, int *vid, unsigned long long *sqnum) + int pnum, bool fast) { - long long uninitialized_var(ec); + long long ec; int err, bitflips = 0, vol_id = -1, ec_err = 0; dbg_bld("scan PEB %d", pnum); @@ -935,6 +982,20 @@ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai, */ ai->maybe_bad_peb_count += 1; case UBI_IO_BAD_HDR: + /* + * If we're facing a bad VID header we have to drop *all* + * Fastmap data structures we find. The most recent Fastmap + * could be bad and therefore there is a chance that we attach + * from an old one. On a fine MTD stack a PEB must not render + * bad all of a sudden, but the reality is different. + * So, let's be paranoid and help finding the root cause by + * falling back to scanning mode instead of attaching with a + * bad EBA table and cause data corruption which is hard to + * analyze. + */ + if (fast) + ai->force_full_scan = 1; + if (ec_err) /* * Both headers are corrupted. 
There is a possibility @@ -991,21 +1052,15 @@ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai, } vol_id = be32_to_cpu(vidh->vol_id); - if (vid) - *vid = vol_id; - if (sqnum) - *sqnum = be64_to_cpu(vidh->sqnum); - if (vol_id > UBI_MAX_VOLUMES && vol_id != UBI_LAYOUT_VOLUME_ID) { + if (vol_id > UBI_MAX_VOLUMES && !vol_ignored(vol_id)) { int lnum = be32_to_cpu(vidh->lnum); /* Unsupported internal volume */ switch (vidh->compat) { case UBI_COMPAT_DELETE: - if (vol_id != UBI_FM_SB_VOLUME_ID - && vol_id != UBI_FM_DATA_VOLUME_ID) { - ubi_msg(ubi, "\"delete\" compatible internal volume %d:%d found, will remove it", - vol_id, lnum); - } + ubi_msg(ubi, "\"delete\" compatible internal volume %d:%d found, will remove it", + vol_id, lnum); + err = add_to_list(ai, pnum, vol_id, lnum, ec, 1, &ai->erase); if (err) @@ -1037,7 +1092,12 @@ static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai, if (ec_err) ubi_warn(ubi, "valid VID header but corrupted EC header at PEB %d", pnum); - err = ubi_add_to_av(ubi, ai, pnum, ec, vidh, bitflips); + + if (ubi_is_fm_vol(vol_id)) + err = add_fastmap(ai, pnum, vidh, ec); + else + err = ubi_add_to_av(ubi, ai, pnum, ec, vidh, bitflips); + if (err) return err; @@ -1186,6 +1246,10 @@ static void destroy_ai(struct ubi_attach_info *ai) list_del(&aeb->u.list); kmem_cache_free(ai->aeb_slab_cache, aeb); } + list_for_each_entry_safe(aeb, aeb_tmp, &ai->fastmap, u.list) { + list_del(&aeb->u.list); + kmem_cache_free(ai->aeb_slab_cache, aeb); + } /* Destroy the volume RB-tree */ rb = ai->volumes.rb_node; @@ -1245,7 +1309,7 @@ static int scan_all(struct ubi_device *ubi, struct ubi_attach_info *ai, cond_resched(); dbg_gen("process PEB %d", pnum); - err = scan_peb(ubi, ai, pnum, NULL, NULL); + err = scan_peb(ubi, ai, pnum, false); if (err < 0) goto out_vidh; } @@ -1311,6 +1375,7 @@ static struct ubi_attach_info *alloc_ai(void) INIT_LIST_HEAD(&ai->free); INIT_LIST_HEAD(&ai->erase); INIT_LIST_HEAD(&ai->alien); + 
INIT_LIST_HEAD(&ai->fastmap); ai->volumes = RB_ROOT; ai->aeb_slab_cache = kmem_cache_create("ubi_aeb_slab_cache", sizeof(struct ubi_ainf_peb), @@ -1326,7 +1391,7 @@ static struct ubi_attach_info *alloc_ai(void) #ifdef CONFIG_MTD_UBI_FASTMAP /** - * scan_fastmap - try to find a fastmap and attach from it. + * scan_fast - try to find a fastmap and attach from it. * @ubi: UBI device description object * @ai: attach info object * @@ -1337,52 +1402,58 @@ static struct ubi_attach_info *alloc_ai(void) */ static int scan_fast(struct ubi_device *ubi, struct ubi_attach_info **ai) { - int err, pnum, fm_anchor = -1; - unsigned long long max_sqnum = 0; + int err, pnum; + struct ubi_attach_info *scan_ai; err = -ENOMEM; + scan_ai = alloc_ai(); + if (!scan_ai) + goto out; + ech = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); if (!ech) - goto out; + goto out_ai; vidh = ubi_zalloc_vid_hdr(ubi, GFP_KERNEL); if (!vidh) goto out_ech; for (pnum = 0; pnum < UBI_FM_MAX_START; pnum++) { - int vol_id = -1; - unsigned long long sqnum = -1; cond_resched(); dbg_gen("process PEB %d", pnum); - err = scan_peb(ubi, *ai, pnum, &vol_id, &sqnum); + err = scan_peb(ubi, scan_ai, pnum, true); if (err < 0) goto out_vidh; - - if (vol_id == UBI_FM_SB_VOLUME_ID && sqnum > max_sqnum) { - max_sqnum = sqnum; - fm_anchor = pnum; - } } ubi_free_vid_hdr(ubi, vidh); kfree(ech); - if (fm_anchor < 0) - return UBI_NO_FASTMAP; + if (scan_ai->force_full_scan) + err = UBI_NO_FASTMAP; + else + err = ubi_scan_fastmap(ubi, *ai, scan_ai); - destroy_ai(*ai); - *ai = alloc_ai(); - if (!*ai) - return -ENOMEM; + if (err) { + /* + * Didn't attach via fastmap, do a full scan but reuse what + * we've aready scanned. 
+ */ + destroy_ai(*ai); + *ai = scan_ai; + } else + destroy_ai(scan_ai); - return ubi_scan_fastmap(ubi, *ai, fm_anchor); + return err; out_vidh: ubi_free_vid_hdr(ubi, vidh); out_ech: kfree(ech); +out_ai: + destroy_ai(scan_ai); out: return err; } diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index ef3618299494..0680516bb472 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -874,7 +874,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; if (ubi && mtd->index == ubi->mtd->index) { - ubi_err(ubi, "mtd%d is already attached to ubi%d", + pr_err("ubi: mtd%d is already attached to ubi%d", mtd->index, i); return -EEXIST; } @@ -889,7 +889,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, * no sense to attach emulated MTD devices, so we prohibit this. */ if (mtd->type == MTD_UBIVOLUME) { - ubi_err(ubi, "refuse attaching mtd%d - it is already emulated on top of UBI", + pr_err("ubi: refuse attaching mtd%d - it is already emulated on top of UBI", mtd->index); return -EINVAL; } @@ -900,7 +900,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, if (!ubi_devices[ubi_num]) break; if (ubi_num == UBI_MAX_DEVICES) { - ubi_err(ubi, "only %d UBI devices may be created", + pr_err("ubi: only %d UBI devices may be created", UBI_MAX_DEVICES); return -ENFILE; } @@ -910,7 +910,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, /* Make sure ubi_num is not busy */ if (ubi_devices[ubi_num]) { - ubi_err(ubi, "already exists"); + pr_err("ubi: ubi%i already exists", ubi_num); return -EEXIST; } } @@ -992,6 +992,9 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, goto out_detach; } + /* Make device "available" before it becomes accessible via sysfs */ + ubi_devices[ubi_num] = ubi; + err = uif_init(ubi, &ref); if (err) goto out_detach; @@ -1036,7 +1039,6 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, wake_up_process(ubi->bgt_thread); 
spin_unlock(&ubi->wl_lock); - ubi_devices[ubi_num] = ubi; ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL); return ubi_num; @@ -1047,6 +1049,7 @@ out_uif: ubi_assert(ref); uif_close(ubi); out_detach: + ubi_devices[ubi_num] = NULL; ubi_wl_close(ubi); ubi_free_internal_volumes(ubi); vfree(ubi->vtbl); diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c index 990898b9dc72..48eb55f344eb 100644 --- a/drivers/mtd/ubi/fastmap.c +++ b/drivers/mtd/ubi/fastmap.c @@ -15,20 +15,22 @@ */ #include <linux/crc32.h> +#include <linux/bitmap.h> #include "ubi.h" /** * init_seen - allocate memory for used for debugging. * @ubi: UBI device description object */ -static inline int *init_seen(struct ubi_device *ubi) +static inline unsigned long *init_seen(struct ubi_device *ubi) { - int *ret; + unsigned long *ret; if (!ubi_dbg_chk_fastmap(ubi)) return NULL; - ret = kcalloc(ubi->peb_count, sizeof(int), GFP_KERNEL); + ret = kcalloc(BITS_TO_LONGS(ubi->peb_count), sizeof(unsigned long), + GFP_KERNEL); if (!ret) return ERR_PTR(-ENOMEM); @@ -39,7 +41,7 @@ static inline int *init_seen(struct ubi_device *ubi) * free_seen - free the seen logic integer array. 
* @seen: integer array of @ubi->peb_count size */ -static inline void free_seen(int *seen) +static inline void free_seen(unsigned long *seen) { kfree(seen); } @@ -50,12 +52,12 @@ static inline void free_seen(int *seen) * @pnum: The PEB to be makred as seen * @seen: integer array of @ubi->peb_count size */ -static inline void set_seen(struct ubi_device *ubi, int pnum, int *seen) +static inline void set_seen(struct ubi_device *ubi, int pnum, unsigned long *seen) { if (!ubi_dbg_chk_fastmap(ubi) || !seen) return; - seen[pnum] = 1; + set_bit(pnum, seen); } /** @@ -63,7 +65,7 @@ static inline void set_seen(struct ubi_device *ubi, int pnum, int *seen) * @ubi: UBI device description object * @seen: integer array of @ubi->peb_count size */ -static int self_check_seen(struct ubi_device *ubi, int *seen) +static int self_check_seen(struct ubi_device *ubi, unsigned long *seen) { int pnum, ret = 0; @@ -71,7 +73,7 @@ static int self_check_seen(struct ubi_device *ubi, int *seen) return 0; for (pnum = 0; pnum < ubi->peb_count; pnum++) { - if (!seen[pnum] && ubi->lookuptbl[pnum]) { + if (!test_bit(pnum, seen) && ubi->lookuptbl[pnum]) { ubi_err(ubi, "self-check failed for PEB %d, fastmap didn't see it", pnum); ret = -EINVAL; } @@ -578,7 +580,7 @@ static int count_fastmap_pebs(struct ubi_attach_info *ai) list_for_each_entry(aeb, &ai->free, u.list) n++; - ubi_rb_for_each_entry(rb1, av, &ai->volumes, rb) + ubi_rb_for_each_entry(rb1, av, &ai->volumes, rb) ubi_rb_for_each_entry(rb2, aeb, &av->root, u.rb) n++; @@ -850,27 +852,57 @@ fail: } /** + * find_fm_anchor - find the most recent Fastmap superblock (anchor) + * @ai: UBI attach info whose 'fastmap' list is searched + * + * Returns the PEB number of the superblock with the highest sequence + * number, or -1 if the list holds no Fastmap superblock. 
+ */ +static int find_fm_anchor(struct ubi_attach_info *ai) +{ + int ret = -1; + struct ubi_ainf_peb *aeb; + unsigned long long max_sqnum = 0; + + list_for_each_entry(aeb, &ai->fastmap, u.list) { + if (aeb->vol_id == UBI_FM_SB_VOLUME_ID && aeb->sqnum > max_sqnum) { + max_sqnum = aeb->sqnum; + ret = aeb->pnum; + } + } + + return ret; +} + +/** * 
ubi_scan_fastmap - scan the fastmap. * @ubi: UBI device object * @ai: UBI attach info to be filled - * @fm_anchor: The fastmap starts at this PEB + * @scan_ai: UBI attach info from the first 64 PEBs, + * used to find the most recent Fastmap data structure * * Returns 0 on success, UBI_NO_FASTMAP if no fastmap was found, * UBI_BAD_FASTMAP if one was found but is not usable. * < 0 indicates an internal error. */ int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai, - int fm_anchor) + struct ubi_attach_info *scan_ai) { struct ubi_fm_sb *fmsb, *fmsb2; struct ubi_vid_hdr *vh; struct ubi_ec_hdr *ech; struct ubi_fastmap_layout *fm; - int i, used_blocks, pnum, ret = 0; + struct ubi_ainf_peb *tmp_aeb, *aeb; + int i, used_blocks, pnum, fm_anchor, ret = 0; size_t fm_size; __be32 crc, tmp_crc; unsigned long long sqnum = 0; + fm_anchor = find_fm_anchor(scan_ai); + if (fm_anchor < 0) + return UBI_NO_FASTMAP; + + /* Move all (possible) fastmap blocks into our new attach structure. */ + list_for_each_entry_safe(aeb, tmp_aeb, &scan_ai->fastmap, u.list) + list_move_tail(&aeb->u.list, &ai->fastmap); + down_write(&ubi->fm_protect); memset(ubi->fm_buf, 0, ubi->fm_size); @@ -945,6 +977,13 @@ int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai, goto free_hdr; } + if (i == 0 && pnum != fm_anchor) { + ubi_err(ubi, "Fastmap anchor PEB mismatch: PEB: %i vs. 
%i", + pnum, fm_anchor); + ret = UBI_BAD_FASTMAP; + goto free_hdr; + } + ret = ubi_io_read_ec_hdr(ubi, pnum, ech, 0); if (ret && ret != UBI_IO_BITFLIPS) { ubi_err(ubi, "unable to read fastmap block# %i EC (PEB: %i)", @@ -1102,7 +1141,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi, struct rb_node *tmp_rb; int ret, i, j, free_peb_count, used_peb_count, vol_count; int scrub_peb_count, erase_peb_count; - int *seen_pebs = NULL; + unsigned long *seen_pebs = NULL; fm_raw = ubi->fm_buf; memset(ubi->fm_buf, 0, ubi->fm_size); diff --git a/drivers/mtd/ubi/gluebi.c b/drivers/mtd/ubi/gluebi.c index cb7c075f2144..1cb287ec32ad 100644 --- a/drivers/mtd/ubi/gluebi.c +++ b/drivers/mtd/ubi/gluebi.c @@ -99,9 +99,6 @@ static int gluebi_get_device(struct mtd_info *mtd) struct gluebi_device *gluebi; int ubi_mode = UBI_READONLY; - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - if (mtd->flags & MTD_WRITEABLE) ubi_mode = UBI_READWRITE; @@ -129,7 +126,6 @@ static int gluebi_get_device(struct mtd_info *mtd) ubi_mode); if (IS_ERR(gluebi->desc)) { mutex_unlock(&devices_mutex); - module_put(THIS_MODULE); return PTR_ERR(gluebi->desc); } gluebi->refcnt += 1; @@ -153,7 +149,6 @@ static void gluebi_put_device(struct mtd_info *mtd) gluebi->refcnt -= 1; if (gluebi->refcnt == 0) ubi_close_volume(gluebi->desc); - module_put(THIS_MODULE); mutex_unlock(&devices_mutex); } diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index 10cf3b549959..ff8cafe1e5cd 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -1019,7 +1019,7 @@ int ubi_io_read_vid_hdr(struct ubi_device *ubi, int pnum, p = (char *)vid_hdr - ubi->vid_hdr_shift; read_err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset, - ubi->vid_hdr_alsize); + ubi->vid_hdr_shift + UBI_VID_HDR_SIZE); if (read_err && read_err != UBI_IO_BITFLIPS && !mtd_is_eccerr(read_err)) return read_err; diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 61d4e99755a4..b616a115c9d3 100644 --- a/drivers/mtd/ubi/ubi.h +++ 
b/drivers/mtd/ubi/ubi.h @@ -703,6 +703,8 @@ struct ubi_ainf_volume { * @erase: list of physical eraseblocks which have to be erased * @alien: list of physical eraseblocks which should not be used by UBI (e.g., * those belonging to "preserve"-compatible internal volumes) + * @fastmap: list of physical eraseblocks which relate to fastmap (e.g., + * eraseblocks of the current and not yet erased old fastmap blocks) * @corr_peb_count: count of PEBs in the @corr list * @empty_peb_count: count of PEBs which are presumably empty (contain only * 0xFF bytes) @@ -713,6 +715,8 @@ struct ubi_ainf_volume { * @vols_found: number of volumes found * @highest_vol_id: highest volume ID * @is_empty: flag indicating whether the MTD device is empty or not + * @force_full_scan: flag indicating whether we need to do a full scan and drop + all existing Fastmap data structures * @min_ec: lowest erase counter value * @max_ec: highest erase counter value * @max_sqnum: highest sequence number value @@ -731,6 +735,7 @@ struct ubi_attach_info { struct list_head free; struct list_head erase; struct list_head alien; + struct list_head fastmap; int corr_peb_count; int empty_peb_count; int alien_peb_count; @@ -739,6 +744,7 @@ struct ubi_attach_info { int vols_found; int highest_vol_id; int is_empty; + int force_full_scan; int min_ec; int max_ec; unsigned long long max_sqnum; @@ -911,7 +917,7 @@ int ubi_compare_lebs(struct ubi_device *ubi, const struct ubi_ainf_peb *aeb, size_t ubi_calc_fm_size(struct ubi_device *ubi); int ubi_update_fastmap(struct ubi_device *ubi); int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai, - int fm_anchor); + struct ubi_attach_info *scan_ai); #else static inline int ubi_update_fastmap(struct ubi_device *ubi) { return 0; } #endif @@ -1105,4 +1111,42 @@ static inline int idx2vol_id(const struct ubi_device *ubi, int idx) return idx; } +/** + * ubi_is_fm_vol - check whether a volume ID is a Fastmap volume. 
+ * @vol_id: volume ID + */ +static inline bool ubi_is_fm_vol(int vol_id) +{ + switch (vol_id) { + case UBI_FM_SB_VOLUME_ID: + case UBI_FM_DATA_VOLUME_ID: + return true; + } + + return false; +} + +/** + * ubi_find_fm_block - check whether a PEB is part of the current Fastmap. + * @ubi: UBI device description object + * @pnum: physical eraseblock to look for + * + * This function returns a wear leveling object if @pnum relates to the current + * fastmap, @NULL otherwise. + */ +static inline struct ubi_wl_entry *ubi_find_fm_block(const struct ubi_device *ubi, + int pnum) +{ + int i; + + if (ubi->fm) { + for (i = 0; i < ubi->fm->used_blocks; i++) { + if (ubi->fm->e[i]->pnum == pnum) + return ubi->fm->e[i]; + } + } + + return NULL; +} + #endif /* !__UBI_UBI_H__ */ diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 10059dfdc1b6..0138f526474a 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -488,13 +488,6 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) spin_unlock(&ubi->volumes_lock); } - /* Change volume table record */ - vtbl_rec = ubi->vtbl[vol_id]; - vtbl_rec.reserved_pebs = cpu_to_be32(reserved_pebs); - err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); - if (err) - goto out_acc; - if (pebs < 0) { for (i = 0; i < -pebs; i++) { err = ubi_eba_unmap_leb(ubi, vol, reserved_pebs + i); @@ -512,6 +505,24 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) spin_unlock(&ubi->volumes_lock); } + /* + * When we shrink a volume we have to flush all pending (erase) work. + * Otherwise it can happen that upon next attach UBI finds a LEB with + * lnum > highest_lnum and refuses to attach. 
+ */ + if (pebs < 0) { + err = ubi_wl_flush(ubi, vol_id, UBI_ALL); + if (err) + goto out_acc; + } + + /* Change volume table record */ + vtbl_rec = ubi->vtbl[vol_id]; + vtbl_rec.reserved_pebs = cpu_to_be32(reserved_pebs); + err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); + if (err) + goto out_acc; + vol->reserved_pebs = reserved_pebs; if (vol->vol_type == UBI_DYNAMIC_VOLUME) { vol->used_ebs = reserved_pebs; diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 959c7b12e0b1..f4533266d7b2 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -1598,19 +1598,44 @@ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai) } } - dbg_wl("found %i PEBs", found_pebs); + list_for_each_entry(aeb, &ai->fastmap, u.list) { + cond_resched(); + + e = ubi_find_fm_block(ubi, aeb->pnum); - if (ubi->fm) { - ubi_assert(ubi->good_peb_count == - found_pebs + ubi->fm->used_blocks); + if (e) { + ubi_assert(!ubi->lookuptbl[e->pnum]); + ubi->lookuptbl[e->pnum] = e; + } else { + /* + * Usually old Fastmap PEBs are scheduled for erasure + * and we don't have to care about them but if we face + * an power cut before scheduling them we need to + * take care of them here. 
+ */ + if (ubi->lookuptbl[aeb->pnum]) + continue; + + e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); + if (!e) + goto out_free; - for (i = 0; i < ubi->fm->used_blocks; i++) { - e = ubi->fm->e[i]; + e->pnum = aeb->pnum; + e->ec = aeb->ec; + ubi_assert(!ubi->lookuptbl[e->pnum]); ubi->lookuptbl[e->pnum] = e; + if (schedule_erase(ubi, e, aeb->vol_id, aeb->lnum, 0)) { + wl_entry_destroy(ubi, e); + goto out_free; + } } + + found_pebs++; } - else - ubi_assert(ubi->good_peb_count == found_pebs); + + dbg_wl("found %i PEBs", found_pebs); + + ubi_assert(ubi->good_peb_count == found_pebs); reserved_pebs = WL_RESERVED_PEBS; ubi_fastmap_init(ubi, &reserved_pebs); diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h index 4705e2dea423..e0ebe1378cb2 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h @@ -104,6 +104,8 @@ enum { enum CPL_error { CPL_ERR_NONE = 0, + CPL_ERR_TCAM_PARITY = 1, + CPL_ERR_TCAM_MISS = 2, CPL_ERR_TCAM_FULL = 3, CPL_ERR_BAD_LENGTH = 15, CPL_ERR_BAD_ROUTE = 18, diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index f4497cf4d06d..d728704d0c7b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -721,6 +721,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 #define QUERY_DEV_CAP_ETH_BACKPL_OFFSET 0x9c +#define QUERY_DEV_CAP_DIAG_RPRT_PER_PORT 0x9c #define QUERY_DEV_CAP_FW_REASSIGN_MAC 0x9d #define QUERY_DEV_CAP_VXLAN 0x9e #define QUERY_DEV_CAP_MAD_DEMUX_OFFSET 0xb0 @@ -935,6 +936,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP; if (field32 & (1 << 7)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT; + MLX4_GET(field32, outbox, 
QUERY_DEV_CAP_DIAG_RPRT_PER_PORT); + if (field32 & (1 << 17)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT; MLX4_GET(field, outbox, QUERY_DEV_CAP_FW_REASSIGN_MAC); if (field & 1<<6) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN; @@ -2457,6 +2461,42 @@ int mlx4_NOP(struct mlx4_dev *dev) MLX4_CMD_NATIVE); } +/* + * Run DIAG_RPRT for @port and copy @array_len 32-bit counters, found at the + * byte offsets listed in @offset[], from the output mailbox into @value[]. + * Returns 0 on success, -EINVAL if an offset does not leave room for a full + * counter inside the mailbox, or the error of the firmware command. + */ +int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier, + const u32 offset[], + u32 value[], size_t array_len, u8 port) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + size_t i; + int ret; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + outbox = mailbox->buf; + + ret = mlx4_cmd_box(dev, 0, mailbox->dma, port, op_modifier, + MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (ret) + goto out; + + for (i = 0; i < array_len; i++) { + /* the whole counter, not just its first byte, must fit */ + if (offset[i] > MLX4_MAILBOX_SIZE - sizeof(value[i])) { + ret = -EINVAL; + goto out; + } + + MLX4_GET(value[i], outbox, offset[i]); + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} +EXPORT_SYMBOL(mlx4_query_diag_counters); + int mlx4_get_phys_port_id(struct mlx4_dev *dev) { u8 port; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c index 04bc522605a0..c07f4d01b70e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c @@ -63,12 +63,12 @@ void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type) complete(&srq->free); } -static int get_pas_size(void *srqc) +static int get_pas_size(struct mlx5_srq_attr *in) { - u32 log_page_size = 
MLX5_GET(srqc, srqc, log_page_size) + 12; - u32 log_srq_size = MLX5_GET(srqc, srqc, log_srq_size); - u32 log_rq_stride = MLX5_GET(srqc, srqc, log_rq_stride); - u32 page_offset = MLX5_GET(srqc, srqc, page_offset); + u32 log_page_size = in->log_page_size + 12; + u32 log_srq_size = in->log_size; + u32 log_rq_stride = in->wqe_shift; + u32 page_offset = in->page_offset; u32 po_quanta = 1 << 
(log_page_size - 6); u32 rq_sz = 1 << (log_srq_size + 4 + log_rq_stride); u32 page_size = 1 << log_page_size; @@ -78,57 +78,58 @@ static int get_pas_size(void *srqc) return rq_num_pas * sizeof(u64); } -static void rmpc_srqc_reformat(void *srqc, void *rmpc, bool srqc_to_rmpc) +static void set_wq(void *wq, struct mlx5_srq_attr *in) { - void *wq = MLX5_ADDR_OF(rmpc, rmpc, wq); - - if (srqc_to_rmpc) { - switch (MLX5_GET(srqc, srqc, state)) { - case MLX5_SRQC_STATE_GOOD: - MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY); - break; - case MLX5_SRQC_STATE_ERROR: - MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_ERR); - break; - default: - pr_warn("%s: %d: Unknown srq state = 0x%x\n", __func__, - __LINE__, MLX5_GET(srqc, srqc, state)); - MLX5_SET(rmpc, rmpc, state, MLX5_GET(srqc, srqc, state)); - } - - MLX5_SET(wq, wq, wq_signature, MLX5_GET(srqc, srqc, wq_signature)); - MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(srqc, srqc, log_page_size)); - MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(srqc, srqc, log_rq_stride) + 4); - MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(srqc, srqc, log_srq_size)); - MLX5_SET(wq, wq, page_offset, MLX5_GET(srqc, srqc, page_offset)); - MLX5_SET(wq, wq, lwm, MLX5_GET(srqc, srqc, lwm)); - MLX5_SET(wq, wq, pd, MLX5_GET(srqc, srqc, pd)); - MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(srqc, srqc, dbr_addr)); - } else { - switch (MLX5_GET(rmpc, rmpc, state)) { - case MLX5_RMPC_STATE_RDY: - MLX5_SET(srqc, srqc, state, MLX5_SRQC_STATE_GOOD); - break; - case MLX5_RMPC_STATE_ERR: - MLX5_SET(srqc, srqc, state, MLX5_SRQC_STATE_ERROR); - break; - default: - pr_warn("%s: %d: Unknown rmp state = 0x%x\n", - __func__, __LINE__, - MLX5_GET(rmpc, rmpc, state)); - MLX5_SET(srqc, srqc, state, - MLX5_GET(rmpc, rmpc, state)); - } - - MLX5_SET(srqc, srqc, wq_signature, MLX5_GET(wq, wq, wq_signature)); - MLX5_SET(srqc, srqc, log_page_size, MLX5_GET(wq, wq, log_wq_pg_sz)); - MLX5_SET(srqc, srqc, log_rq_stride, MLX5_GET(wq, wq, log_wq_stride) - 4); - MLX5_SET(srqc, srqc, log_srq_size, 
MLX5_GET(wq, wq, log_wq_sz)); - MLX5_SET(srqc, srqc, page_offset, MLX5_GET(wq, wq, page_offset)); - MLX5_SET(srqc, srqc, lwm, MLX5_GET(wq, wq, lwm)); - MLX5_SET(srqc, srqc, pd, MLX5_GET(wq, wq, pd)); - MLX5_SET64(srqc, srqc, dbr_addr, MLX5_GET64(wq, wq, dbr_addr)); - } + MLX5_SET(wq, wq, wq_signature, !!(in->flags + & MLX5_SRQ_FLAG_WQ_SIG)); + MLX5_SET(wq, wq, log_wq_pg_sz, in->log_page_size); + MLX5_SET(wq, wq, log_wq_stride, in->wqe_shift + 4); + MLX5_SET(wq, wq, log_wq_sz, in->log_size); + MLX5_SET(wq, wq, page_offset, in->page_offset); + MLX5_SET(wq, wq, lwm, in->lwm); + MLX5_SET(wq, wq, pd, in->pd); + MLX5_SET64(wq, wq, dbr_addr, in->db_record); +} + +static void set_srqc(void *srqc, struct mlx5_srq_attr *in) +{ + MLX5_SET(srqc, srqc, wq_signature, !!(in->flags + & MLX5_SRQ_FLAG_WQ_SIG)); + MLX5_SET(srqc, srqc, log_page_size, in->log_page_size); + MLX5_SET(srqc, srqc, log_rq_stride, in->wqe_shift); + MLX5_SET(srqc, srqc, log_srq_size, in->log_size); + MLX5_SET(srqc, srqc, page_offset, in->page_offset); + MLX5_SET(srqc, srqc, lwm, in->lwm); + MLX5_SET(srqc, srqc, pd, in->pd); + MLX5_SET64(srqc, srqc, dbr_addr, in->db_record); + MLX5_SET(srqc, srqc, xrcd, in->xrcd); + MLX5_SET(srqc, srqc, cqn, in->cqn); +} + +/* + * Decode a WQ context into @in. The signature flag is only ever set here, + * never cleared, so the caller must hand in initialised flags. + */ +static void get_wq(void *wq, struct mlx5_srq_attr *in) +{ + if (MLX5_GET(wq, wq, wq_signature)) + in->flags |= MLX5_SRQ_FLAG_WQ_SIG; + in->log_page_size = MLX5_GET(wq, wq, log_wq_pg_sz); + in->wqe_shift = MLX5_GET(wq, wq, log_wq_stride) - 4; + in->log_size = MLX5_GET(wq, wq, log_wq_sz); + in->page_offset = MLX5_GET(wq, wq, page_offset); + in->lwm = MLX5_GET(wq, wq, lwm); + in->pd = MLX5_GET(wq, wq, pd); + in->db_record = MLX5_GET64(wq, wq, dbr_addr); +} + +/* + * Decode an SRQ context into @in. The signature flag is only ever set here, + * never cleared, so the caller must hand in initialised flags. 
+ */ +static void get_srqc(void *srqc, struct mlx5_srq_attr *in) +{ + if (MLX5_GET(srqc, srqc, wq_signature)) + in->flags |= MLX5_SRQ_FLAG_WQ_SIG; + in->log_page_size = MLX5_GET(srqc, srqc, log_page_size); + in->wqe_shift = MLX5_GET(srqc, srqc, log_rq_stride); + in->log_size = MLX5_GET(srqc, srqc, 
log_srq_size); + in->page_offset = MLX5_GET(srqc, srqc, page_offset); + in->lwm = MLX5_GET(srqc, srqc, lwm); + in->pd = MLX5_GET(srqc, srqc, pd); + in->db_record = MLX5_GET64(srqc, srqc, dbr_addr); } struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn) @@ -149,19 +150,36 @@ struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn) EXPORT_SYMBOL(mlx5_core_get_srq); static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_create_srq_mbox_in *in, int inlen) + struct mlx5_srq_attr *in) { - struct mlx5_create_srq_mbox_out out; + u32 create_out[MLX5_ST_SZ_DW(create_srq_out)] = {0}; + void *create_in; + void *srqc; + void *pas; + int pas_size; + int inlen; int err; - memset(&out, 0, sizeof(out)); + pas_size = get_pas_size(in); + inlen = MLX5_ST_SZ_BYTES(create_srq_in) + pas_size; + create_in = mlx5_vzalloc(inlen); + if (!create_in) + return -ENOMEM; + + srqc = MLX5_ADDR_OF(create_srq_in, create_in, srq_context_entry); + pas = MLX5_ADDR_OF(create_srq_in, create_in, pas); - in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_SRQ); + set_srqc(srqc, in); + memcpy(pas, in->pas, pas_size); - err = mlx5_cmd_exec_check_status(dev, (u32 *)in, inlen, (u32 *)(&out), - sizeof(out)); + MLX5_SET(create_srq_in, create_in, opcode, + MLX5_CMD_OP_CREATE_SRQ); - srq->srqn = be32_to_cpu(out.srqn) & 0xffffff; + err = mlx5_cmd_exec_check_status(dev, create_in, inlen, create_out, + sizeof(create_out)); + kvfree(create_in); + if (!err) + srq->srqn = MLX5_GET(create_srq_out, create_out, srqn); return err; } @@ -169,67 +187,75 @@ static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, static int destroy_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq) { - struct mlx5_destroy_srq_mbox_in in; - struct mlx5_destroy_srq_mbox_out out; + u32 srq_in[MLX5_ST_SZ_DW(destroy_srq_in)] = {0}; + u32 srq_out[MLX5_ST_SZ_DW(destroy_srq_out)] = {0}; - memset(&in, 0, sizeof(in)); - memset(&out, 0, 
sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ); - in.srqn = cpu_to_be32(srq->srqn); + MLX5_SET(destroy_srq_in, srq_in, opcode, + MLX5_CMD_OP_DESTROY_SRQ); + MLX5_SET(destroy_srq_in, srq_in, srqn, srq->srqn); - return mlx5_cmd_exec_check_status(dev, (u32 *)(&in), sizeof(in), - (u32 *)(&out), sizeof(out)); + return mlx5_cmd_exec_check_status(dev, srq_in, sizeof(srq_in), + srq_out, sizeof(srq_out)); } static int arm_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, u16 lwm, int is_srq) { - struct mlx5_arm_srq_mbox_in in; - struct mlx5_arm_srq_mbox_out out; - - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); + /* arm_srq structs missing using identical xrc ones */ + u32 srq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {0}; + u32 srq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0}; - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ARM_RQ); - in.hdr.opmod = cpu_to_be16(!!is_srq); - in.srqn = cpu_to_be32(srq->srqn); - in.lwm = cpu_to_be16(lwm); + MLX5_SET(arm_xrc_srq_in, srq_in, opcode, MLX5_CMD_OP_ARM_XRC_SRQ); + MLX5_SET(arm_xrc_srq_in, srq_in, xrc_srqn, srq->srqn); + MLX5_SET(arm_xrc_srq_in, srq_in, lwm, lwm); - return mlx5_cmd_exec_check_status(dev, (u32 *)(&in), - sizeof(in), (u32 *)(&out), - sizeof(out)); + return mlx5_cmd_exec_check_status(dev, srq_in, sizeof(srq_in), + srq_out, sizeof(srq_out)); } static int query_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_query_srq_mbox_out *out) + struct mlx5_srq_attr *out) { - struct mlx5_query_srq_mbox_in in; + u32 srq_in[MLX5_ST_SZ_DW(query_srq_in)] = {0}; + u32 *srq_out; + void *srqc; + int err; - memset(&in, 0, sizeof(in)); + srq_out = mlx5_vzalloc(MLX5_ST_SZ_BYTES(query_srq_out)); + if (!srq_out) + return -ENOMEM; - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SRQ); - in.srqn = cpu_to_be32(srq->srqn); + MLX5_SET(query_srq_in, srq_in, opcode, + MLX5_CMD_OP_QUERY_SRQ); + MLX5_SET(query_srq_in, srq_in, srqn, srq->srqn); + err = mlx5_cmd_exec_check_status(dev, srq_in, 
sizeof(srq_in), + srq_out, + MLX5_ST_SZ_BYTES(query_srq_out)); + if (err) + goto out; - return mlx5_cmd_exec_check_status(dev, (u32 *)(&in), sizeof(in), - (u32 *)out, sizeof(*out)); + srqc = MLX5_ADDR_OF(query_srq_out, srq_out, srq_context_entry); + get_srqc(srqc, out); + if (MLX5_GET(srqc, srqc, state) != MLX5_SRQC_STATE_GOOD) + out->flags |= MLX5_SRQ_FLAG_ERR; +out: + kvfree(srq_out); + return err; } static int create_xrc_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_create_srq_mbox_in *in, - int srq_inlen) + struct mlx5_srq_attr *in) { u32 create_out[MLX5_ST_SZ_DW(create_xrc_srq_out)]; void *create_in; - void *srqc; void *xrc_srqc; void *pas; int pas_size; int inlen; int err; - srqc = MLX5_ADDR_OF(create_srq_in, in, srq_context_entry); - pas_size = get_pas_size(srqc); + pas_size = get_pas_size(in); inlen = MLX5_ST_SZ_BYTES(create_xrc_srq_in) + pas_size; create_in = mlx5_vzalloc(inlen); if (!create_in) @@ -239,7 +265,8 @@ static int create_xrc_srq_cmd(struct mlx5_core_dev *dev, xrc_srq_context_entry); pas = MLX5_ADDR_OF(create_xrc_srq_in, create_in, pas); - memcpy(xrc_srqc, srqc, MLX5_ST_SZ_BYTES(srqc)); + set_srqc(xrc_srqc, in); + MLX5_SET(xrc_srqc, xrc_srqc, user_index, in->user_index); memcpy(pas, in->pas, pas_size); MLX5_SET(create_xrc_srq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRC_SRQ); @@ -293,11 +320,10 @@ static int arm_xrc_srq_cmd(struct mlx5_core_dev *dev, static int query_xrc_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_query_srq_mbox_out *out) + struct mlx5_srq_attr *out) { u32 xrcsrq_in[MLX5_ST_SZ_DW(query_xrc_srq_in)]; u32 *xrcsrq_out; - void *srqc; void *xrc_srqc; int err; @@ -317,8 +343,9 @@ static int query_xrc_srq_cmd(struct mlx5_core_dev *dev, xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, xrcsrq_out, xrc_srq_context_entry); - srqc = MLX5_ADDR_OF(query_srq_out, out, srq_context_entry); - memcpy(srqc, xrc_srqc, MLX5_ST_SZ_BYTES(srqc)); + get_srqc(xrc_srqc, out); + if (MLX5_GET(xrc_srqc, 
xrc_srqc, state) != MLX5_XRC_SRQC_STATE_GOOD) + out->flags |= MLX5_SRQ_FLAG_ERR; out: kvfree(xrcsrq_out); @@ -326,26 +353,27 @@ out: } static int create_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_create_srq_mbox_in *in, int srq_inlen) + struct mlx5_srq_attr *in) { void *create_in; void *rmpc; - void *srqc; + void *wq; int pas_size; int inlen; int err; - srqc = MLX5_ADDR_OF(create_srq_in, in, srq_context_entry); - pas_size = get_pas_size(srqc); + pas_size = get_pas_size(in); inlen = MLX5_ST_SZ_BYTES(create_rmp_in) + pas_size; create_in = mlx5_vzalloc(inlen); if (!create_in) return -ENOMEM; rmpc = MLX5_ADDR_OF(create_rmp_in, create_in, ctx); + wq = MLX5_ADDR_OF(rmpc, rmpc, wq); + MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY); + set_wq(wq, in); memcpy(MLX5_ADDR_OF(rmpc, rmpc, wq.pas), in->pas, pas_size); - rmpc_srqc_reformat(srqc, rmpc, true); err = mlx5_core_create_rmp(dev, create_in, inlen, &srq->srqn); @@ -390,11 +418,10 @@ static int arm_rmp_cmd(struct mlx5_core_dev *dev, } static int query_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_query_srq_mbox_out *out) + struct mlx5_srq_attr *out) { u32 *rmp_out; void *rmpc; - void *srqc; int err; rmp_out = mlx5_vzalloc(MLX5_ST_SZ_BYTES(query_rmp_out)); @@ -405,9 +432,10 @@ static int query_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, if (err) goto out; - srqc = MLX5_ADDR_OF(query_srq_out, out, srq_context_entry); rmpc = MLX5_ADDR_OF(query_rmp_out, rmp_out, rmp_context); - rmpc_srqc_reformat(srqc, rmpc, false); + get_wq(MLX5_ADDR_OF(rmpc, rmpc, wq), out); + if (MLX5_GET(rmpc, rmpc, state) != MLX5_RMPC_STATE_RDY) + out->flags |= MLX5_SRQ_FLAG_ERR; out: kvfree(rmp_out); @@ -416,15 +444,14 @@ out: static int create_srq_split(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_create_srq_mbox_in *in, - int inlen, int is_xrc) + struct mlx5_srq_attr *in) { if (!dev->issi) - return create_srq_cmd(dev, srq, in, inlen); + return 
create_srq_cmd(dev, srq, in); else if (srq->common.res == MLX5_RES_XSRQ) - return create_xrc_srq_cmd(dev, srq, in, inlen); + return create_xrc_srq_cmd(dev, srq, in); else - return create_rmp_cmd(dev, srq, in, inlen); + return create_rmp_cmd(dev, srq, in); } static int destroy_srq_split(struct mlx5_core_dev *dev, @@ -439,15 +466,17 @@ static int destroy_srq_split(struct mlx5_core_dev *dev, } int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_create_srq_mbox_in *in, int inlen, - int is_xrc) + struct mlx5_srq_attr *in) { int err; struct mlx5_srq_table *table = &dev->priv.srq_table; - srq->common.res = is_xrc ? MLX5_RES_XSRQ : MLX5_RES_SRQ; + if (in->type == IB_SRQT_XRC) + srq->common.res = MLX5_RES_XSRQ; + else + srq->common.res = MLX5_RES_SRQ; - err = create_srq_split(dev, srq, in, inlen, is_xrc); + err = create_srq_split(dev, srq, in); if (err) return err; @@ -502,7 +531,7 @@ int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq) EXPORT_SYMBOL(mlx5_core_destroy_srq); int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_query_srq_mbox_out *out) + struct mlx5_srq_attr *out) { if (!dev->issi) return query_srq_cmd(dev, srq, out); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index 03a5093ffeb7..28274a6fbafe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -85,6 +85,7 @@ int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn) return err; } +EXPORT_SYMBOL(mlx5_core_create_rq); int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen) { @@ -110,6 +111,7 @@ void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_destroy_rq); int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, 
u32 *out) { @@ -430,6 +432,7 @@ int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, return err; } +EXPORT_SYMBOL(mlx5_core_create_rqt); int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, int inlen) @@ -455,3 +458,4 @@ void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_destroy_rqt); diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index fa49f9143b80..a46b585fae31 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -675,6 +675,9 @@ static void acpiphp_check_bridge(struct acpiphp_bridge *bridge) if (bridge->is_going_away) return; + if (bridge->pci_dev) + pm_runtime_get_sync(&bridge->pci_dev->dev); + list_for_each_entry(slot, &bridge->slots, node) { struct pci_bus *bus = slot->bus; struct pci_dev *dev, *tmp; @@ -694,6 +697,9 @@ static void acpiphp_check_bridge(struct acpiphp_bridge *bridge) disable_slot(slot); } } + + if (bridge->pci_dev) + pm_runtime_put(&bridge->pci_dev->dev); } /* diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index bedb361746a0..c38a5b9733c8 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -60,6 +60,7 @@ #include <linux/delay.h> #include <linux/acpi.h> #include <linux/freezer.h> +#include <linux/kmod.h> #include <linux/kthread.h> #include <asm/page.h> diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 18639e0cb6e2..e215f50794b6 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -5,6 +5,10 @@ config RTC_LIB bool +config RTC_MC146818_LIB + bool + select RTC_LIB + menuconfig RTC_CLASS bool "Real Time Clock" default n @@ -574,10 +578,10 @@ config RTC_DRV_EM3027 will be called rtc-em3027. 
config RTC_DRV_RV8803 - tristate "Micro Crystal RV8803" + tristate "Micro Crystal RV8803, Epson RX8900" help - If you say yes here you get support for the Micro Crystal - RV8803 RTC chips. + If you say yes here you get support for the Micro Crystal RV8803 and + Epson RX8900 RTC chips. This driver can also be built as a module. If so, the module will be called rtc-rv8803. @@ -670,6 +674,18 @@ config RTC_DRV_DS1390 This driver can also be built as a module. If so, the module will be called rtc-ds1390. +config RTC_DRV_MAX6916 + tristate "Maxim MAX6916" + help + If you say yes here you will get support for the + Maxim MAX6916 SPI RTC chip. + + This driver only supports the RTC feature, and not other chip + features such as alarms. + + This driver can also be built as a module. If so, the module + will be called rtc-max6916. + config RTC_DRV_R9701 tristate "Epson RTC-9701JE" help @@ -795,8 +811,9 @@ comment "Platform RTC drivers" config RTC_DRV_CMOS tristate "PC-style 'CMOS'" - depends on X86 || ARM || M32R || PPC || MIPS || SPARC64 + depends on X86 || ARM || M32R || PPC || MIPS || SPARC64 || MN10300 default y if X86 + select RTC_MC146818_LIB help Say "yes" here to get direct support for the real time clock found in every PC or ACPI-based system, and some other boards. 
@@ -815,6 +832,7 @@ config RTC_DRV_CMOS config RTC_DRV_ALPHA bool "Alpha PC-style CMOS" depends on ALPHA + select RTC_MC146818_LIB default y help Direct support for the real-time clock found on every Alpha diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index ea2833723fa9..7cf7ad559c79 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_RTC_LIB) += rtc-lib.o obj-$(CONFIG_RTC_HCTOSYS) += hctosys.o obj-$(CONFIG_RTC_SYSTOHC) += systohc.o obj-$(CONFIG_RTC_CLASS) += rtc-core.o +obj-$(CONFIG_RTC_MC146818_LIB) += rtc-mc146818-lib.o rtc-core-y := class.o interface.o ifdef CONFIG_RTC_DRV_EFI @@ -85,6 +86,7 @@ obj-$(CONFIG_RTC_DRV_M48T59) += rtc-m48t59.o obj-$(CONFIG_RTC_DRV_M48T86) += rtc-m48t86.o obj-$(CONFIG_RTC_DRV_MAX6900) += rtc-max6900.o obj-$(CONFIG_RTC_DRV_MAX6902) += rtc-max6902.o +obj-$(CONFIG_RTC_DRV_MAX6916) += rtc-max6916.o obj-$(CONFIG_RTC_DRV_MAX77686) += rtc-max77686.o obj-$(CONFIG_RTC_DRV_MAX8907) += rtc-max8907.o obj-$(CONFIG_RTC_DRV_MAX8925) += rtc-max8925.o diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 9ef5f6f89f98..84a52db9b05f 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -104,7 +104,17 @@ static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *al else if (!rtc->ops->read_alarm) err = -EINVAL; else { - memset(alarm, 0, sizeof(struct rtc_wkalrm)); + alarm->enabled = 0; + alarm->pending = 0; + alarm->time.tm_sec = -1; + alarm->time.tm_min = -1; + alarm->time.tm_hour = -1; + alarm->time.tm_mday = -1; + alarm->time.tm_mon = -1; + alarm->time.tm_year = -1; + alarm->time.tm_wday = -1; + alarm->time.tm_yday = -1; + alarm->time.tm_isdst = -1; err = rtc->ops->read_alarm(rtc->dev.parent, alarm); } @@ -383,7 +393,7 @@ int rtc_initialize_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time); rtc->aie_timer.period = ktime_set(0, 0); - /* Alarm has to be enabled & in the futrure for us to 
enqueue it */ + /* Alarm has to be enabled & in the future for us to enqueue it */ if (alarm->enabled && (rtc_tm_to_ktime(now).tv64 < rtc->aie_timer.node.expires.tv64)) { @@ -395,8 +405,6 @@ int rtc_initialize_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) } EXPORT_SYMBOL_GPL(rtc_initialize_alarm); - - int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled) { int err = mutex_lock_interruptible(&rtc->ops_lock); @@ -748,9 +756,23 @@ EXPORT_SYMBOL_GPL(rtc_irq_set_freq); */ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) { + struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue); + struct rtc_time tm; + ktime_t now; + timer->enabled = 1; + __rtc_read_time(rtc, &tm); + now = rtc_tm_to_ktime(tm); + + /* Skip over expired timers */ + while (next) { + if (next->expires.tv64 >= now.tv64) + break; + next = timerqueue_iterate_next(next); + } + timerqueue_add(&rtc->timerqueue, &timer->node); - if (&timer->node == timerqueue_getnext(&rtc->timerqueue)) { + if (!next) { struct rtc_wkalrm alarm; int err; alarm.time = rtc_ktime_to_tm(timer->node.expires); diff --git a/drivers/rtc/rtc-abx80x.c b/drivers/rtc/rtc-abx80x.c index ba0d61934d35..fea9a60b06cf 100644 --- a/drivers/rtc/rtc-abx80x.c +++ b/drivers/rtc/rtc-abx80x.c @@ -643,17 +643,15 @@ static int abx80x_probe(struct i2c_client *client, return err; } - err = devm_add_action(&client->dev, rtc_calib_remove_sysfs_group, - &client->dev); - if (err) { - rtc_calib_remove_sysfs_group(&client->dev); + err = devm_add_action_or_reset(&client->dev, + rtc_calib_remove_sysfs_group, + &client->dev); + if (err) dev_err(&client->dev, "Failed to add sysfs cleanup action: %d\n", err); - return err; - } - return 0; + return err; } static int abx80x_remove(struct i2c_client *client) diff --git a/drivers/rtc/rtc-asm9260.c b/drivers/rtc/rtc-asm9260.c index 355fdb97a006..5219916ce11d 100644 --- a/drivers/rtc/rtc-asm9260.c +++ b/drivers/rtc/rtc-asm9260.c @@ -343,7 +343,6 @@ static 
struct platform_driver asm9260_rtc_driver = { .remove = asm9260_rtc_remove, .driver = { .name = "asm9260-rtc", - .owner = THIS_MODULE, .of_match_table = asm9260_dt_ids, }, }; diff --git a/drivers/rtc/rtc-at91sam9.c b/drivers/rtc/rtc-at91sam9.c index 99732e6f8c3b..7418a763ce52 100644 --- a/drivers/rtc/rtc-at91sam9.c +++ b/drivers/rtc/rtc-at91sam9.c @@ -375,6 +375,7 @@ static int at91_rtc_probe(struct platform_device *pdev) if (!rtc) return -ENOMEM; + spin_lock_init(&rtc->lock); rtc->irq = irq; /* platform setup code should have handled this; sigh */ diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index fbe9c72438e1..43745cac0141 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -43,7 +43,7 @@ #include <linux/of_platform.h> /* this is for "generic access to PC-style RTC" using CMOS_READ/CMOS_WRITE */ -#include <asm-generic/rtc.h> +#include <linux/mc146818rtc.h> struct cmos_rtc { struct rtc_device *rtc; @@ -190,10 +190,10 @@ static inline void cmos_write_bank2(unsigned char val, unsigned char addr) static int cmos_read_time(struct device *dev, struct rtc_time *t) { /* REVISIT: if the clock has a "century" register, use - * that instead of the heuristic in get_rtc_time(). + * that instead of the heuristic in mc146818_get_time(). * That'll make Y3K compatility (year > 2070) easy! */ - get_rtc_time(t); + mc146818_get_time(t); return 0; } @@ -205,7 +205,7 @@ static int cmos_set_time(struct device *dev, struct rtc_time *t) * takes effect exactly 500ms after we write the register. * (Also queueing and other delays before we get this far.) */ - return set_rtc_time(t); + return mc146818_set_time(t); } static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t) @@ -220,8 +220,6 @@ static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t) * Some also support day and month, for alarms up to a year in * the future. 
*/ - t->time.tm_mday = -1; - t->time.tm_mon = -1; spin_lock_irq(&rtc_lock); t->time.tm_sec = CMOS_READ(RTC_SECONDS_ALARM); @@ -272,7 +270,6 @@ static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t) } } } - t->time.tm_year = -1; t->enabled = !!(rtc_control & RTC_AIE); t->pending = 0; @@ -630,7 +627,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) address_space = 64; #elif defined(__i386__) || defined(__x86_64__) || defined(__arm__) \ || defined(__sparc__) || defined(__mips__) \ - || defined(__powerpc__) + || defined(__powerpc__) || defined(CONFIG_MN10300) address_space = 128; #else #warning Assuming 128 bytes of RTC+NVRAM address space, not 64 bytes. @@ -1142,14 +1139,14 @@ static __init void cmos_of_init(struct platform_device *pdev) if (val) CMOS_WRITE(be32_to_cpup(val), RTC_FREQ_SELECT); - get_rtc_time(&time); + cmos_read_time(&pdev->dev, &time); ret = rtc_valid_tm(&time); if (ret) { struct rtc_time def_time = { .tm_year = 1, .tm_mday = 1, }; - set_rtc_time(&def_time); + cmos_set_time(&pdev->dev, &def_time); } } #else diff --git a/drivers/rtc/rtc-da9052.c b/drivers/rtc/rtc-da9052.c index a20bcf0e33cd..4273377562ec 100644 --- a/drivers/rtc/rtc-da9052.c +++ b/drivers/rtc/rtc-da9052.c @@ -85,6 +85,7 @@ static int da9052_read_alarm(struct da9052_rtc *rtc, struct rtc_time *rtc_tm) rtc_tm->tm_mday = v[0][2] & DA9052_RTC_DAY; rtc_tm->tm_hour = v[0][1] & DA9052_RTC_HOUR; rtc_tm->tm_min = v[0][0] & DA9052_RTC_MIN; + rtc_tm->tm_sec = 0; ret = rtc_valid_tm(rtc_tm); return ret; diff --git a/drivers/rtc/rtc-da9055.c b/drivers/rtc/rtc-da9055.c index 7ec0872d5e3b..678af8648c45 100644 --- a/drivers/rtc/rtc-da9055.c +++ b/drivers/rtc/rtc-da9055.c @@ -74,6 +74,7 @@ static int da9055_read_alarm(struct da9055 *da9055, struct rtc_time *rtc_tm) rtc_tm->tm_mday = v[2] & DA9055_RTC_ALM_DAY; rtc_tm->tm_hour = v[1] & DA9055_RTC_ALM_HOUR; rtc_tm->tm_min = v[0] & DA9055_RTC_ALM_MIN; + rtc_tm->tm_sec = 0; return rtc_valid_tm(rtc_tm); } diff --git 
a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index c5432bf64e1c..dba60c1dfce2 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -388,6 +388,8 @@ static int davinci_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm) u8 day0, day1; unsigned long flags; + alm->time.tm_sec = 0; + spin_lock_irqsave(&davinci_rtc_lock, flags); davinci_rtcss_calendar_wait(davinci_rtc); diff --git a/drivers/rtc/rtc-ds1286.c b/drivers/rtc/rtc-ds1286.c index 756e509f6ed2..ef75c349dff9 100644 --- a/drivers/rtc/rtc-ds1286.c +++ b/drivers/rtc/rtc-ds1286.c @@ -16,7 +16,7 @@ #include <linux/rtc.h> #include <linux/platform_device.h> #include <linux/bcd.h> -#include <linux/ds1286.h> +#include <linux/rtc/ds1286.h> #include <linux/io.h> #include <linux/slab.h> diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index 8e41c4613e51..72b22935eb62 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -313,13 +313,6 @@ static int ds1305_get_alarm(struct device *dev, struct rtc_wkalrm *alm) alm->time.tm_sec = bcd2bin(buf[DS1305_SEC]); alm->time.tm_min = bcd2bin(buf[DS1305_MIN]); alm->time.tm_hour = bcd2hour(buf[DS1305_HOUR]); - alm->time.tm_mday = -1; - alm->time.tm_mon = -1; - alm->time.tm_year = -1; - /* next three fields are unused by Linux */ - alm->time.tm_wday = -1; - alm->time.tm_mday = -1; - alm->time.tm_isdst = -1; return 0; } diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 821d9c089cdb..8e1c5cb6ece6 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -482,11 +482,6 @@ static int ds1337_read_alarm(struct device *dev, struct rtc_wkalrm *t) t->time.tm_min = bcd2bin(ds1307->regs[1] & 0x7f); t->time.tm_hour = bcd2bin(ds1307->regs[2] & 0x3f); t->time.tm_mday = bcd2bin(ds1307->regs[3] & 0x3f); - t->time.tm_mon = -1; - t->time.tm_year = -1; - t->time.tm_wday = -1; - t->time.tm_yday = -1; - t->time.tm_isdst = -1; /* ... 
and status */ t->enabled = !!(ds1307->regs[7] & DS1337_BIT_A1IE); @@ -602,6 +597,8 @@ static const struct rtc_class_ops ds13xx_rtc_ops = { * Alarm support for mcp794xx devices. */ +#define MCP794XX_REG_WEEKDAY 0x3 +#define MCP794XX_REG_WEEKDAY_WDAY_MASK 0x7 #define MCP794XX_REG_CONTROL 0x07 # define MCP794XX_BIT_ALM0_EN 0x10 # define MCP794XX_BIT_ALM1_EN 0x20 @@ -1231,13 +1228,16 @@ static int ds1307_probe(struct i2c_client *client, { struct ds1307 *ds1307; int err = -ENODEV; - int tmp; + int tmp, wday; struct chip_desc *chip = &chips[id->driver_data]; struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent); bool want_irq = false; bool ds1307_can_wakeup_device = false; unsigned char *buf; struct ds1307_platform_data *pdata = dev_get_platdata(&client->dev); + struct rtc_time tm; + unsigned long timestamp; + irq_handler_t irq_handler = ds1307_irq; static const int bbsqi_bitpos[] = { @@ -1526,6 +1526,27 @@ read_rtc: bin2bcd(tmp)); } + /* + * Some IPs have weekday reset value = 0x1 which might not correct + * hence compute the wday using the current date/month/year values + */ + ds1307_get_time(&client->dev, &tm); + wday = tm.tm_wday; + timestamp = rtc_tm_to_time64(&tm); + rtc_time64_to_tm(timestamp, &tm); + + /* + * Check if reset wday is different from the computed wday + * If different then set the wday which we computed using + * timestamp + */ + if (wday != tm.tm_wday) { + wday = i2c_smbus_read_byte_data(client, MCP794XX_REG_WEEKDAY); + wday = wday & ~MCP794XX_REG_WEEKDAY_WDAY_MASK; + wday = wday | (tm.tm_wday + 1); + i2c_smbus_write_byte_data(client, MCP794XX_REG_WEEKDAY, wday); + } + if (want_irq) { device_set_wakeup_capable(&client->dev, true); set_bit(HAS_ALARM, &ds1307->flags); diff --git a/drivers/rtc/rtc-ds1343.c b/drivers/rtc/rtc-ds1343.c index 23fa9f0cb5e3..895fbeeb47fe 100644 --- a/drivers/rtc/rtc-ds1343.c +++ b/drivers/rtc/rtc-ds1343.c @@ -504,12 +504,6 @@ static int ds1343_read_alarm(struct device *dev, struct rtc_wkalrm *alarm) 
alarm->time.tm_hour = priv->alarm_hour < 0 ? 0 : priv->alarm_hour; alarm->time.tm_mday = priv->alarm_mday < 0 ? 0 : priv->alarm_mday; - alarm->time.tm_mon = -1; - alarm->time.tm_year = -1; - alarm->time.tm_wday = -1; - alarm->time.tm_yday = -1; - alarm->time.tm_isdst = -1; - out: mutex_unlock(&priv->mutex); return res; diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index b3ce3c652fcd..ed43b4311660 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -103,6 +103,26 @@ ds1685_rtc_bin2bcd(struct ds1685_priv *rtc, u8 val, u8 bin_mask, u8 bcd_mask) } /** + * s1685_rtc_check_mday - check validity of the day of month. + * @rtc: pointer to the ds1685 rtc structure. + * @mday: day of month. + * + * Returns -EDOM if the day of month is not within 1..31 range. + */ +static inline int +ds1685_rtc_check_mday(struct ds1685_priv *rtc, u8 mday) +{ + if (rtc->bcd_mode) { + if (mday < 0x01 || mday > 0x31 || (mday & 0x0f) > 0x09) + return -EDOM; + } else { + if (mday < 1 || mday > 31) + return -EDOM; + } + return 0; +} + +/** * ds1685_rtc_switch_to_bank0 - switch the rtc to bank 0. * @rtc: pointer to the ds1685 rtc structure. */ @@ -377,6 +397,7 @@ ds1685_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct platform_device *pdev = to_platform_device(dev); struct ds1685_priv *rtc = platform_get_drvdata(pdev); u8 seconds, minutes, hours, mday, ctrlb, ctrlc; + int ret; /* Fetch the alarm info from the RTC alarm registers. */ ds1685_rtc_begin_data_access(rtc); @@ -388,34 +409,29 @@ ds1685_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) ctrlc = rtc->read(rtc, RTC_CTRL_C); ds1685_rtc_end_data_access(rtc); - /* Check month date. */ - if (!(mday >= 1) && (mday <= 31)) - return -EDOM; + /* Check the month date for validity. */ + ret = ds1685_rtc_check_mday(rtc, mday); + if (ret) + return ret; /* * Check the three alarm bytes. * * The Linux RTC system doesn't support the "don't care" capability * of this RTC chip. 
We check for it anyways in case support is - * added in the future. + * added in the future and only assign when we care. */ - if (unlikely(seconds >= 0xc0)) - alrm->time.tm_sec = -1; - else + if (likely(seconds < 0xc0)) alrm->time.tm_sec = ds1685_rtc_bcd2bin(rtc, seconds, RTC_SECS_BCD_MASK, RTC_SECS_BIN_MASK); - if (unlikely(minutes >= 0xc0)) - alrm->time.tm_min = -1; - else + if (likely(minutes < 0xc0)) alrm->time.tm_min = ds1685_rtc_bcd2bin(rtc, minutes, RTC_MINS_BCD_MASK, RTC_MINS_BIN_MASK); - if (unlikely(hours >= 0xc0)) - alrm->time.tm_hour = -1; - else + if (likely(hours < 0xc0)) alrm->time.tm_hour = ds1685_rtc_bcd2bin(rtc, hours, RTC_HRS_24_BCD_MASK, RTC_HRS_24_BIN_MASK); @@ -423,11 +439,6 @@ ds1685_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) /* Write the data to rtc_wkalrm. */ alrm->time.tm_mday = ds1685_rtc_bcd2bin(rtc, mday, RTC_MDAY_BCD_MASK, RTC_MDAY_BIN_MASK); - alrm->time.tm_mon = -1; - alrm->time.tm_year = -1; - alrm->time.tm_wday = -1; - alrm->time.tm_yday = -1; - alrm->time.tm_isdst = -1; alrm->enabled = !!(ctrlb & RTC_CTRL_B_AIE); alrm->pending = !!(ctrlc & RTC_CTRL_C_AF); @@ -445,6 +456,7 @@ ds1685_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct platform_device *pdev = to_platform_device(dev); struct ds1685_priv *rtc = platform_get_drvdata(pdev); u8 ctrlb, seconds, minutes, hours, mday; + int ret; /* Fetch the alarm info and convert to BCD. */ seconds = ds1685_rtc_bin2bcd(rtc, alrm->time.tm_sec, @@ -461,8 +473,9 @@ ds1685_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) RTC_MDAY_BCD_MASK); /* Check the month date for validity. */ - if (!(mday >= 1) && (mday <= 31)) - return -EDOM; + ret = ds1685_rtc_check_mday(rtc, mday); + if (ret) + return ret; /* * Check the three alarm bytes. 
diff --git a/drivers/rtc/rtc-ds2404.c b/drivers/rtc/rtc-ds2404.c index 16310fe79d76..9a1582ed7070 100644 --- a/drivers/rtc/rtc-ds2404.c +++ b/drivers/rtc/rtc-ds2404.c @@ -13,7 +13,7 @@ #include <linux/rtc.h> #include <linux/types.h> #include <linux/bcd.h> -#include <linux/rtc-ds2404.h> +#include <linux/platform_data/rtc-ds2404.h> #include <linux/delay.h> #include <linux/gpio.h> #include <linux/slab.h> diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c index 04fbd7fffd0d..b1f20d8c358f 100644 --- a/drivers/rtc/rtc-ds3232.c +++ b/drivers/rtc/rtc-ds3232.c @@ -197,12 +197,6 @@ static int ds3232_read_alarm(struct device *dev, struct rtc_wkalrm *alarm) alarm->time.tm_hour = bcd2bin(buf[2] & 0x7F); alarm->time.tm_mday = bcd2bin(buf[3] & 0x7F); - alarm->time.tm_mon = -1; - alarm->time.tm_year = -1; - alarm->time.tm_wday = -1; - alarm->time.tm_yday = -1; - alarm->time.tm_isdst = -1; - alarm->enabled = !!(control & DS3232_REG_CR_A1IE); alarm->pending = !!(stat & DS3232_REG_SR_A1F); diff --git a/drivers/rtc/rtc-efi.c b/drivers/rtc/rtc-efi.c index 96d38609d803..0130afd7fe88 100644 --- a/drivers/rtc/rtc-efi.c +++ b/drivers/rtc/rtc-efi.c @@ -259,6 +259,12 @@ static const struct rtc_class_ops efi_rtc_ops = { static int __init efi_rtc_probe(struct platform_device *dev) { struct rtc_device *rtc; + efi_time_t eft; + efi_time_cap_t cap; + + /* First check if the RTC is usable */ + if (efi.get_time(&eft, &cap) != EFI_SUCCESS) + return -ENODEV; rtc = devm_rtc_device_register(&dev->dev, "rtc-efi", &efi_rtc_ops, THIS_MODULE); diff --git a/drivers/rtc/rtc-generic.c b/drivers/rtc/rtc-generic.c index d726c6aa96a8..1bf5d2347928 100644 --- a/drivers/rtc/rtc-generic.c +++ b/drivers/rtc/rtc-generic.c @@ -9,44 +9,10 @@ #include <linux/platform_device.h> #include <linux/rtc.h> -#if defined(CONFIG_M68K) || defined(CONFIG_PARISC) || \ - defined(CONFIG_PPC) || defined(CONFIG_SUPERH32) -#include <asm/rtc.h> - -static int generic_get_time(struct device *dev, struct rtc_time *tm) -{ - 
unsigned int ret = get_rtc_time(tm); - - if (ret & RTC_BATT_BAD) - return -EOPNOTSUPP; - - return rtc_valid_tm(tm); -} - -static int generic_set_time(struct device *dev, struct rtc_time *tm) -{ - if (set_rtc_time(tm) < 0) - return -EOPNOTSUPP; - - return 0; -} - -static const struct rtc_class_ops generic_rtc_ops = { - .read_time = generic_get_time, - .set_time = generic_set_time, -}; -#else -#define generic_rtc_ops *(struct rtc_class_ops*)NULL -#endif - static int __init generic_rtc_probe(struct platform_device *dev) { struct rtc_device *rtc; - const struct rtc_class_ops *ops; - - ops = dev_get_platdata(&dev->dev); - if (!ops) - ops = &generic_rtc_ops; + const struct rtc_class_ops *ops = dev_get_platdata(&dev->dev); rtc = devm_rtc_device_register(&dev->dev, "rtc-generic", ops, THIS_MODULE); diff --git a/drivers/rtc/rtc-hym8563.c b/drivers/rtc/rtc-hym8563.c index 207270376b55..e5ad527cb75e 100644 --- a/drivers/rtc/rtc-hym8563.c +++ b/drivers/rtc/rtc-hym8563.c @@ -198,7 +198,7 @@ static int hym8563_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm) return ret; /* The alarm only has a minute accuracy */ - alm_tm->tm_sec = -1; + alm_tm->tm_sec = 0; alm_tm->tm_min = (buf[0] & HYM8563_ALM_BIT_DISABLE) ? 
-1 : @@ -213,9 +213,6 @@ static int hym8563_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm) -1 : bcd2bin(buf[3] & HYM8563_WEEKDAY_MASK); - alm_tm->tm_mon = -1; - alm_tm->tm_year = -1; - ret = i2c_smbus_read_byte_data(client, HYM8563_CTL2); if (ret < 0) return ret; diff --git a/drivers/rtc/rtc-isl12057.c b/drivers/rtc/rtc-isl12057.c index 54328d4ac0d3..0e7f0f52bfe4 100644 --- a/drivers/rtc/rtc-isl12057.c +++ b/drivers/rtc/rtc-isl12057.c @@ -245,8 +245,7 @@ static int isl12057_rtc_update_alarm(struct device *dev, int enable) static int isl12057_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm) { struct isl12057_rtc_data *data = dev_get_drvdata(dev); - struct rtc_time rtc_tm, *alarm_tm = &alarm->time; - unsigned long rtc_secs, alarm_secs; + struct rtc_time *alarm_tm = &alarm->time; u8 regs[ISL12057_A1_SEC_LEN]; unsigned int ir; int ret; @@ -264,36 +263,6 @@ static int isl12057_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm) alarm_tm->tm_min = bcd2bin(regs[1] & 0x7f); alarm_tm->tm_hour = bcd2bin(regs[2] & 0x3f); alarm_tm->tm_mday = bcd2bin(regs[3] & 0x3f); - alarm_tm->tm_wday = -1; - - /* - * The alarm section does not store year/month. We use the ones in rtc - * section as a basis and increment month and then year if needed to get - * alarm after current time. 
- */ - ret = _isl12057_rtc_read_time(dev, &rtc_tm); - if (ret) - goto err_unlock; - - alarm_tm->tm_year = rtc_tm.tm_year; - alarm_tm->tm_mon = rtc_tm.tm_mon; - - ret = rtc_tm_to_time(&rtc_tm, &rtc_secs); - if (ret) - goto err_unlock; - - ret = rtc_tm_to_time(alarm_tm, &alarm_secs); - if (ret) - goto err_unlock; - - if (alarm_secs < rtc_secs) { - if (alarm_tm->tm_mon == 11) { - alarm_tm->tm_mon = 0; - alarm_tm->tm_year += 1; - } else { - alarm_tm->tm_mon += 1; - } - } ret = regmap_read(data->regmap, ISL12057_REG_INT, &ir); if (ret) { diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index d1bf93a87200..58698d21c2c3 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -244,7 +244,7 @@ static int m41t80_alarm_irq_enable(struct device *dev, unsigned int enabled) retval = i2c_smbus_write_byte_data(client, M41T80_REG_ALARM_MON, flags); if (retval < 0) { - dev_info(dev, "Unable to enable alarm IRQ %d\n", retval); + dev_err(dev, "Unable to enable alarm IRQ %d\n", retval); return retval; } return 0; @@ -320,10 +320,8 @@ static int m41t80_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) alrm->time.tm_sec = bcd2bin(alarmvals[4] & 0x7f); alrm->time.tm_min = bcd2bin(alarmvals[3] & 0x7f); alrm->time.tm_hour = bcd2bin(alarmvals[2] & 0x3f); - alrm->time.tm_wday = -1; alrm->time.tm_mday = bcd2bin(alarmvals[1] & 0x3f); alrm->time.tm_mon = bcd2bin(alarmvals[0] & 0x3f); - alrm->time.tm_year = -1; alrm->enabled = !!(alarmvals[0] & M41T80_ALMON_AFE); alrm->pending = (flags & M41T80_FLAGS_AF) && alrm->enabled; @@ -337,6 +335,30 @@ static struct rtc_class_ops m41t80_rtc_ops = { .proc = m41t80_rtc_proc, }; +#ifdef CONFIG_PM_SLEEP +static int m41t80_suspend(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + + if (client->irq >= 0 && device_may_wakeup(dev)) + enable_irq_wake(client->irq); + + return 0; +} + +static int m41t80_resume(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + + if (client->irq >= 
0 && device_may_wakeup(dev)) + disable_irq_wake(client->irq); + + return 0; +} +#endif + +static SIMPLE_DEV_PM_OPS(m41t80_pm, m41t80_suspend, m41t80_resume); + static ssize_t flags_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -831,10 +853,9 @@ static int m41t80_probe(struct i2c_client *client, return rc; } - rc = devm_add_action(&client->dev, m41t80_remove_sysfs_group, - &client->dev); + rc = devm_add_action_or_reset(&client->dev, m41t80_remove_sysfs_group, + &client->dev); if (rc) { - m41t80_remove_sysfs_group(&client->dev); dev_err(&client->dev, "Failed to add sysfs cleanup action: %d\n", rc); return rc; @@ -873,6 +894,7 @@ static int m41t80_remove(struct i2c_client *client) static struct i2c_driver m41t80_driver = { .driver = { .name = "rtc-m41t80", + .pm = &m41t80_pm, }, .probe = m41t80_probe, .remove = m41t80_remove, diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index f72b91f2501f..0eeb5714c00f 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -16,7 +16,7 @@ #include <linux/module.h> #include <linux/rtc.h> #include <linux/platform_device.h> -#include <linux/m48t86.h> +#include <linux/platform_data/rtc-m48t86.h> #include <linux/bcd.h> #define M48T86_REG_SEC 0x00 diff --git a/drivers/rtc/rtc-max6916.c b/drivers/rtc/rtc-max6916.c new file mode 100644 index 000000000000..623ab27b2757 --- /dev/null +++ b/drivers/rtc/rtc-max6916.c @@ -0,0 +1,164 @@ +/* rtc-max6916.c + * + * Driver for MAXIM max6916 Low Current, SPI Compatible + * Real Time Clock + * + * Author : Venkat Prashanth B U <venkat.prashanth2498@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/platform_device.h> +#include <linux/rtc.h> +#include <linux/spi/spi.h> +#include <linux/bcd.h> + +/* Registers in max6916 rtc */ + +#define MAX6916_SECONDS_REG 0x01 +#define MAX6916_MINUTES_REG 0x02 +#define MAX6916_HOURS_REG 0x03 +#define MAX6916_DATE_REG 0x04 +#define MAX6916_MONTH_REG 0x05 +#define MAX6916_DAY_REG 0x06 +#define MAX6916_YEAR_REG 0x07 +#define MAX6916_CONTROL_REG 0x08 +#define MAX6916_STATUS_REG 0x0C +#define MAX6916_CLOCK_BURST 0x3F + +static int max6916_read_reg(struct device *dev, unsigned char address, + unsigned char *data) +{ + struct spi_device *spi = to_spi_device(dev); + + *data = address | 0x80; + + return spi_write_then_read(spi, data, 1, data, 1); +} + +static int max6916_write_reg(struct device *dev, unsigned char address, + unsigned char data) +{ + struct spi_device *spi = to_spi_device(dev); + unsigned char buf[2]; + + buf[0] = address & 0x7F; + buf[1] = data; + + return spi_write_then_read(spi, buf, 2, NULL, 0); +} + +static int max6916_read_time(struct device *dev, struct rtc_time *dt) +{ + struct spi_device *spi = to_spi_device(dev); + int err; + unsigned char buf[8]; + + buf[0] = MAX6916_CLOCK_BURST | 0x80; + + err = spi_write_then_read(spi, buf, 1, buf, 8); + + if (err) + return err; + + dt->tm_sec = bcd2bin(buf[0]); + dt->tm_min = bcd2bin(buf[1]); + dt->tm_hour = bcd2bin(buf[2] & 0x3F); + dt->tm_mday = bcd2bin(buf[3]); + dt->tm_mon = bcd2bin(buf[4]) - 1; + dt->tm_wday = bcd2bin(buf[5]) - 1; + dt->tm_year = bcd2bin(buf[6]) + 100; + + return rtc_valid_tm(dt); +} + +static int max6916_set_time(struct device *dev, struct rtc_time *dt) +{ + struct spi_device *spi = to_spi_device(dev); + unsigned char buf[9]; + + if (dt->tm_year < 100 || dt->tm_year > 199) { + dev_err(&spi->dev, "Year must be between 2000 and 2099. 
It's %d.\n", + dt->tm_year + 1900); + return -EINVAL; + } + + buf[0] = MAX6916_CLOCK_BURST & 0x7F; + buf[1] = bin2bcd(dt->tm_sec); + buf[2] = bin2bcd(dt->tm_min); + buf[3] = (bin2bcd(dt->tm_hour) & 0X3F); + buf[4] = bin2bcd(dt->tm_mday); + buf[5] = bin2bcd(dt->tm_mon + 1); + buf[6] = bin2bcd(dt->tm_wday + 1); + buf[7] = bin2bcd(dt->tm_year % 100); + buf[8] = bin2bcd(0x00); + + /* write the rtc settings */ + return spi_write_then_read(spi, buf, 9, NULL, 0); +} + +static const struct rtc_class_ops max6916_rtc_ops = { + .read_time = max6916_read_time, + .set_time = max6916_set_time, +}; + +static int max6916_probe(struct spi_device *spi) +{ + struct rtc_device *rtc; + unsigned char data; + int res; + + /* spi setup with max6916 in mode 3 and bits per word as 8 */ + spi->mode = SPI_MODE_3; + spi->bits_per_word = 8; + spi_setup(spi); + + /* RTC Settings */ + res = max6916_read_reg(&spi->dev, MAX6916_SECONDS_REG, &data); + if (res) + return res; + + /* Disable the write protect of rtc */ + max6916_read_reg(&spi->dev, MAX6916_CONTROL_REG, &data); + data = data & ~(1 << 7); + max6916_write_reg(&spi->dev, MAX6916_CONTROL_REG, data); + + /*Enable oscillator,disable oscillator stop flag, glitch filter*/ + max6916_read_reg(&spi->dev, MAX6916_STATUS_REG, &data); + data = data & 0x1B; + max6916_write_reg(&spi->dev, MAX6916_STATUS_REG, data); + + /* display the settings */ + max6916_read_reg(&spi->dev, MAX6916_CONTROL_REG, &data); + dev_info(&spi->dev, "MAX6916 RTC CTRL Reg = 0x%02x\n", data); + + max6916_read_reg(&spi->dev, MAX6916_STATUS_REG, &data); + dev_info(&spi->dev, "MAX6916 RTC Status Reg = 0x%02x\n", data); + + rtc = devm_rtc_device_register(&spi->dev, "max6916", + &max6916_rtc_ops, THIS_MODULE); + if (IS_ERR(rtc)) + return PTR_ERR(rtc); + + spi_set_drvdata(spi, rtc); + + return 0; +} + +static struct spi_driver max6916_driver = { + .driver = { + .name = "max6916", + }, + .probe = max6916_probe, +}; +module_spi_driver(max6916_driver); + +MODULE_DESCRIPTION("MAX6916 SPI 
RTC DRIVER"); +MODULE_AUTHOR("Venkat Prashanth B U <venkat.prashanth2498@gmail.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/include/asm-generic/rtc.h b/drivers/rtc/rtc-mc146818-lib.c index 4e3b6558331e..2f1772a358ca 100644 --- a/include/asm-generic/rtc.h +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -1,40 +1,16 @@ -/* - * include/asm-generic/rtc.h - * - * Author: Tom Rini <trini@mvista.com> - * - * Based on: - * drivers/char/rtc.c - * - * Please read the COPYING file for all license details. - */ - -#ifndef __ASM_RTC_H__ -#define __ASM_RTC_H__ - -#include <linux/mc146818rtc.h> -#include <linux/rtc.h> #include <linux/bcd.h> #include <linux/delay.h> +#include <linux/export.h> +#include <linux/mc146818rtc.h> + #ifdef CONFIG_ACPI #include <linux/acpi.h> #endif -#define RTC_PIE 0x40 /* periodic interrupt enable */ -#define RTC_AIE 0x20 /* alarm interrupt enable */ -#define RTC_UIE 0x10 /* update-finished interrupt enable */ - -/* some dummy definitions */ -#define RTC_BATT_BAD 0x100 /* battery bad */ -#define RTC_SQWE 0x08 /* enable square-wave output */ -#define RTC_DM_BINARY 0x04 /* all time/date values are BCD if clear */ -#define RTC_24H 0x02 /* 24 hour mode - else hours bit 7 means pm */ -#define RTC_DST_EN 0x01 /* auto switch DST - works f. USA only */ - /* * Returns true if a clock update is in progress */ -static inline unsigned char rtc_is_updating(void) +static inline unsigned char mc146818_is_updating(void) { unsigned char uip; unsigned long flags; @@ -45,7 +21,7 @@ static inline unsigned char rtc_is_updating(void) return uip; } -static inline unsigned int __get_rtc_time(struct rtc_time *time) +unsigned int mc146818_get_time(struct rtc_time *time) { unsigned char ctrl; unsigned long flags; @@ -60,11 +36,11 @@ static inline unsigned int __get_rtc_time(struct rtc_time *time) * can take just over 2ms. We wait 20ms. There is no need to * to poll-wait (up to 1s - eeccch) for the falling edge of RTC_UIP. 
* If you need to know *exactly* when a second has started, enable - * periodic update complete interrupts, (via ioctl) and then + * periodic update complete interrupts, (via ioctl) and then * immediately read /dev/rtc which will block until you get the IRQ. * Once the read clears, read the RTC time (again via ioctl). Easy. */ - if (rtc_is_updating()) + if (mc146818_is_updating()) mdelay(20); /* @@ -120,13 +96,10 @@ static inline unsigned int __get_rtc_time(struct rtc_time *time) return RTC_24H; } - -#ifndef get_rtc_time -#define get_rtc_time __get_rtc_time -#endif +EXPORT_SYMBOL_GPL(mc146818_get_time); /* Set the current date and time in the real time clock. */ -static inline int __set_rtc_time(struct rtc_time *time) +int mc146818_set_time(struct rtc_time *time) { unsigned long flags; unsigned char mon, day, hrs, min, sec; @@ -222,26 +195,4 @@ static inline int __set_rtc_time(struct rtc_time *time) return 0; } - -#ifndef set_rtc_time -#define set_rtc_time __set_rtc_time -#endif - -static inline unsigned int get_rtc_ss(void) -{ - struct rtc_time h; - - get_rtc_time(&h); - return h.tm_sec; -} - -static inline int get_rtc_pll(struct rtc_pll_info *pll) -{ - return -EINVAL; -} -static inline int set_rtc_pll(struct rtc_pll_info *pll) -{ - return -EINVAL; -} - -#endif /* __ASM_RTC_H__ */ +EXPORT_SYMBOL_GPL(mc146818_set_time); diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index 0094d9bdd1e6..7334c44fa7c3 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -32,11 +32,11 @@ #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/kernel.h> +#include <linux/mc146818rtc.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sfi.h> -#include <asm-generic/rtc.h> #include <asm/intel_scu_ipc.h> #include <asm/intel-mid.h> #include <asm/intel_mid_vrtc.h> @@ -149,14 +149,6 @@ static int mrst_read_alarm(struct device *dev, struct rtc_wkalrm *t) if (mrst->irq <= 0) return -EIO; - /* Basic alarms only support hour, minute, and 
seconds fields. - * Some also support day and month, for alarms up to a year in - * the future. - */ - t->time.tm_mday = -1; - t->time.tm_mon = -1; - t->time.tm_year = -1; - /* vRTC only supports binary mode */ spin_lock_irq(&rtc_lock); t->time.tm_sec = vrtc_cmos_read(RTC_SECONDS_ALARM); diff --git a/drivers/rtc/rtc-pcf2123.c b/drivers/rtc/rtc-pcf2123.c index f22e060709e5..b4478cc92b55 100644 --- a/drivers/rtc/rtc-pcf2123.c +++ b/drivers/rtc/rtc-pcf2123.c @@ -96,7 +96,7 @@ #define CD_TMR_TE BIT(3) /* Countdown timer enable */ /* PCF2123_REG_OFFSET BITS */ -#define OFFSET_SIGN_BIT BIT(6) /* 2's complement sign bit */ +#define OFFSET_SIGN_BIT 6 /* 2's complement sign bit */ #define OFFSET_COARSE BIT(7) /* Coarse mode offset */ #define OFFSET_STEP (2170) /* Offset step in parts per billion */ @@ -217,7 +217,7 @@ static int pcf2123_read_offset(struct device *dev, long *offset) if (reg & OFFSET_COARSE) reg <<= 1; /* multiply by 2 and sign extend */ else - reg |= (reg & OFFSET_SIGN_BIT) << 1; /* sign extend only */ + reg = sign_extend32(reg, OFFSET_SIGN_BIT); *offset = ((long)reg) * OFFSET_STEP; diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c index e8ddbb359d11..efb0a08ac117 100644 --- a/drivers/rtc/rtc-pcf85063.c +++ b/drivers/rtc/rtc-pcf85063.c @@ -16,6 +16,16 @@ #include <linux/rtc.h> #include <linux/module.h> +/* + * Information for this driver was pulled from the following datasheets. + * + * http://www.nxp.com/documents/data_sheet/PCF85063A.pdf + * http://www.nxp.com/documents/data_sheet/PCF85063TP.pdf + * + * PCF85063A -- Rev. 6 — 18 November 2015 + * PCF85063TP -- Rev. 4 — 6 May 2015 +*/ + #define PCF85063_REG_CTRL1 0x00 /* status */ #define PCF85063_REG_CTRL1_STOP BIT(5) #define PCF85063_REG_CTRL2 0x01 @@ -55,10 +65,22 @@ static int pcf85063_stop_clock(struct i2c_client *client, u8 *ctrl1) return 0; } -/* - * In the routines that deal directly with the pcf85063 hardware, we use - * rtc_time -- month 0-11, hour 0-23, yr = calendar year-epoch. 
- */ +static int pcf85063_start_clock(struct i2c_client *client, u8 ctrl1) +{ + s32 ret; + + /* start the clock */ + ctrl1 &= PCF85063_REG_CTRL1_STOP; + + ret = i2c_smbus_write_byte_data(client, PCF85063_REG_CTRL1, ctrl1); + if (ret < 0) { + dev_err(&client->dev, "Failing to start the clock\n"); + return -EIO; + } + + return 0; +} + static int pcf85063_get_datetime(struct i2c_client *client, struct rtc_time *tm) { int rc; @@ -90,8 +112,7 @@ static int pcf85063_get_datetime(struct i2c_client *client, struct rtc_time *tm) tm->tm_wday = regs[4] & 0x07; tm->tm_mon = bcd2bin(regs[5] & 0x1F) - 1; /* rtc mn 1-12 */ tm->tm_year = bcd2bin(regs[6]); - if (tm->tm_year < 70) - tm->tm_year += 100; /* assume we are in 1970...2069 */ + tm->tm_year += 100; return rtc_valid_tm(tm); } @@ -99,13 +120,17 @@ static int pcf85063_get_datetime(struct i2c_client *client, struct rtc_time *tm) static int pcf85063_set_datetime(struct i2c_client *client, struct rtc_time *tm) { int rc; - u8 regs[8]; + u8 regs[7]; + u8 ctrl1; + + if ((tm->tm_year < 100) || (tm->tm_year > 199)) + return -EINVAL; /* * to accurately set the time, reset the divider chain and keep it in * reset state until all time/date registers are written */ - rc = pcf85063_stop_clock(client, &regs[7]); + rc = pcf85063_stop_clock(client, &ctrl1); if (rc != 0) return rc; @@ -125,14 +150,7 @@ static int pcf85063_set_datetime(struct i2c_client *client, struct rtc_time *tm) regs[5] = bin2bcd(tm->tm_mon + 1); /* year and century */ - regs[6] = bin2bcd(tm->tm_year % 100); - - /* - * after all time/date registers are written, let the 'address auto - * increment' feature wrap around and write register CTRL1 to re-enable - * the clock divider chain again - */ - regs[7] &= ~PCF85063_REG_CTRL1_STOP; + regs[6] = bin2bcd(tm->tm_year - 100); /* write all registers at once */ rc = i2c_smbus_write_i2c_block_data(client, PCF85063_REG_SC, @@ -142,6 +160,15 @@ static int pcf85063_set_datetime(struct i2c_client *client, struct rtc_time *tm) return rc; }
+ /* + * Write the control register as a separate action since the size of + * the register space is different between the PCF85063TP and + * PCF85063A devices. The rollover point can not be used. + */ + rc = pcf85063_start_clock(client, ctrl1); + if (rc != 0) + return rc; + return 0; } diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c index b9ddbb001283..1227ceab61ee 100644 --- a/drivers/rtc/rtc-pcf8563.c +++ b/drivers/rtc/rtc-pcf8563.c @@ -341,14 +341,11 @@ static int pcf8563_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *tm) "%s: raw data is min=%02x, hr=%02x, mday=%02x, wday=%02x\n", __func__, buf[0], buf[1], buf[2], buf[3]); + tm->time.tm_sec = 0; tm->time.tm_min = bcd2bin(buf[0] & 0x7F); tm->time.tm_hour = bcd2bin(buf[1] & 0x3F); tm->time.tm_mday = bcd2bin(buf[2] & 0x3F); tm->time.tm_wday = bcd2bin(buf[3] & 0x7); - tm->time.tm_mon = -1; - tm->time.tm_year = -1; - tm->time.tm_yday = -1; - tm->time.tm_isdst = -1; err = pcf8563_get_alarm_mode(client, &tm->enabled, &tm->pending); if (err < 0) diff --git a/drivers/rtc/rtc-rc5t583.c b/drivers/rtc/rtc-rc5t583.c index f28d57788951..68ce77414bdc 100644 --- a/drivers/rtc/rtc-rc5t583.c +++ b/drivers/rtc/rtc-rc5t583.c @@ -128,6 +128,7 @@ static int rc5t583_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm) return ret; } + alm->time.tm_sec = 0; alm->time.tm_min = bcd2bin(alarm_data[0]); alm->time.tm_hour = bcd2bin(alarm_data[1]); alm->time.tm_mday = bcd2bin(alarm_data[2]); diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c index ef86229428fc..c8c757466783 100644 --- a/drivers/rtc/rtc-rs5c372.c +++ b/drivers/rtc/rtc-rs5c372.c @@ -341,12 +341,6 @@ static int rs5c_read_alarm(struct device *dev, struct rtc_wkalrm *t) t->time.tm_sec = 0; t->time.tm_min = bcd2bin(rs5c->regs[RS5C_REG_ALARM_A_MIN] & 0x7f); t->time.tm_hour = rs5c_reg2hr(rs5c, rs5c->regs[RS5C_REG_ALARM_A_HOURS]); - t->time.tm_mday = -1; - t->time.tm_mon = -1; - t->time.tm_year = -1; - t->time.tm_wday = -1; - 
t->time.tm_yday = -1; - t->time.tm_isdst = -1; /* ... and status */ t->enabled = !!(rs5c->regs[RS5C_REG_CTRL1] & RS5C_CTRL1_AALE); diff --git a/drivers/rtc/rtc-rv8803.c b/drivers/rtc/rtc-rv8803.c index f623038e586e..9a2f6a95d5a7 100644 --- a/drivers/rtc/rtc-rv8803.c +++ b/drivers/rtc/rtc-rv8803.c @@ -13,12 +13,15 @@ #include <linux/bcd.h> #include <linux/bitops.h> +#include <linux/log2.h> #include <linux/i2c.h> #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtc.h> +#define RV8803_I2C_TRY_COUNT 4 + #define RV8803_SEC 0x00 #define RV8803_MIN 0x01 #define RV8803_HOUR 0x02 @@ -56,19 +59,85 @@ struct rv8803_data { u8 ctrl; }; +static int rv8803_read_reg(const struct i2c_client *client, u8 reg) +{ + int try = RV8803_I2C_TRY_COUNT; + s32 ret; + + /* + * There is a 61µs window during which the RTC does not acknowledge I2C + * transfers. In that case, ensure that there are multiple attempts. + */ + do + ret = i2c_smbus_read_byte_data(client, reg); + while ((ret == -ENXIO || ret == -EIO) && --try); + if (ret < 0) + dev_err(&client->dev, "Unable to read register 0x%02x\n", reg); + + return ret; +} + +static int rv8803_read_regs(const struct i2c_client *client, + u8 reg, u8 count, u8 *values) +{ + int try = RV8803_I2C_TRY_COUNT; + s32 ret; + + do + ret = i2c_smbus_read_i2c_block_data(client, reg, count, values); + while ((ret == -ENXIO || ret == -EIO) && --try); + if (ret != count) { + dev_err(&client->dev, + "Unable to read registers 0x%02x..0x%02x\n", + reg, reg + count - 1); + return ret < 0 ? 
ret : -EIO; + } + + return 0; +} + +static int rv8803_write_reg(const struct i2c_client *client, u8 reg, u8 value) +{ + int try = RV8803_I2C_TRY_COUNT; + s32 ret; + + do + ret = i2c_smbus_write_byte_data(client, reg, value); + while ((ret == -ENXIO || ret == -EIO) && --try); + if (ret) + dev_err(&client->dev, "Unable to write register 0x%02x\n", reg); + + return ret; +} + +static int rv8803_write_regs(const struct i2c_client *client, + u8 reg, u8 count, const u8 *values) +{ + int try = RV8803_I2C_TRY_COUNT; + s32 ret; + + do + ret = i2c_smbus_write_i2c_block_data(client, reg, count, + values); + while ((ret == -ENXIO || ret == -EIO) && --try); + if (ret) + dev_err(&client->dev, + "Unable to write registers 0x%02x..0x%02x\n", + reg, reg + count - 1); + + return ret; +} + static irqreturn_t rv8803_handle_irq(int irq, void *dev_id) { struct i2c_client *client = dev_id; struct rv8803_data *rv8803 = i2c_get_clientdata(client); unsigned long events = 0; - int flags, try = 0; + int flags; mutex_lock(&rv8803->flags_lock); - do { - flags = i2c_smbus_read_byte_data(client, RV8803_FLAG); - try++; - } while ((flags == -ENXIO) && (try < 3)); + flags = rv8803_read_reg(client, RV8803_FLAG); if (flags <= 0) { mutex_unlock(&rv8803->flags_lock); return IRQ_NONE; @@ -100,9 +169,8 @@ static irqreturn_t rv8803_handle_irq(int irq, void *dev_id) if (events) { rtc_update_irq(rv8803->rtc, 1, events); - i2c_smbus_write_byte_data(client, RV8803_FLAG, flags); - i2c_smbus_write_byte_data(rv8803->client, RV8803_CTRL, - rv8803->ctrl); + rv8803_write_reg(client, RV8803_FLAG, flags); + rv8803_write_reg(rv8803->client, RV8803_CTRL, rv8803->ctrl); } mutex_unlock(&rv8803->flags_lock); @@ -118,7 +186,7 @@ static int rv8803_get_time(struct device *dev, struct rtc_time *tm) u8 *date = date1; int ret, flags; - flags = i2c_smbus_read_byte_data(rv8803->client, RV8803_FLAG); + flags = rv8803_read_reg(rv8803->client, RV8803_FLAG); if (flags < 0) return flags; @@ -127,16 +195,14 @@ static int 
rv8803_get_time(struct device *dev, struct rtc_time *tm) return -EINVAL; } - ret = i2c_smbus_read_i2c_block_data(rv8803->client, RV8803_SEC, - 7, date); - if (ret != 7) - return ret < 0 ? ret : -EIO; + ret = rv8803_read_regs(rv8803->client, RV8803_SEC, 7, date); + if (ret) + return ret; if ((date1[RV8803_SEC] & 0x7f) == bin2bcd(59)) { - ret = i2c_smbus_read_i2c_block_data(rv8803->client, RV8803_SEC, - 7, date2); - if (ret != 7) - return ret < 0 ? ret : -EIO; + ret = rv8803_read_regs(rv8803->client, RV8803_SEC, 7, date2); + if (ret) + return ret; if ((date2[RV8803_SEC] & 0x7f) != bin2bcd(59)) date = date2; @@ -145,23 +211,33 @@ static int rv8803_get_time(struct device *dev, struct rtc_time *tm) tm->tm_sec = bcd2bin(date[RV8803_SEC] & 0x7f); tm->tm_min = bcd2bin(date[RV8803_MIN] & 0x7f); tm->tm_hour = bcd2bin(date[RV8803_HOUR] & 0x3f); - tm->tm_wday = ffs(date[RV8803_WEEK] & 0x7f); + tm->tm_wday = ilog2(date[RV8803_WEEK] & 0x7f); tm->tm_mday = bcd2bin(date[RV8803_DAY] & 0x3f); tm->tm_mon = bcd2bin(date[RV8803_MONTH] & 0x1f) - 1; tm->tm_year = bcd2bin(date[RV8803_YEAR]) + 100; - return rtc_valid_tm(tm); + return 0; } static int rv8803_set_time(struct device *dev, struct rtc_time *tm) { struct rv8803_data *rv8803 = dev_get_drvdata(dev); u8 date[7]; - int flags, ret; + int ctrl, flags, ret; if ((tm->tm_year < 100) || (tm->tm_year > 199)) return -EINVAL; + ctrl = rv8803_read_reg(rv8803->client, RV8803_CTRL); + if (ctrl < 0) + return ctrl; + + /* Stop the clock */ + ret = rv8803_write_reg(rv8803->client, RV8803_CTRL, + ctrl | RV8803_CTRL_RESET); + if (ret) + return ret; + date[RV8803_SEC] = bin2bcd(tm->tm_sec); date[RV8803_MIN] = bin2bcd(tm->tm_min); date[RV8803_HOUR] = bin2bcd(tm->tm_hour); @@ -170,21 +246,26 @@ static int rv8803_set_time(struct device *dev, struct rtc_time *tm) date[RV8803_MONTH] = bin2bcd(tm->tm_mon + 1); date[RV8803_YEAR] = bin2bcd(tm->tm_year - 100); - ret = i2c_smbus_write_i2c_block_data(rv8803->client, RV8803_SEC, - 7, date); - if (ret < 0) + ret = 
rv8803_write_regs(rv8803->client, RV8803_SEC, 7, date); + if (ret) + return ret; + + /* Restart the clock */ + ret = rv8803_write_reg(rv8803->client, RV8803_CTRL, + ctrl & ~RV8803_CTRL_RESET); + if (ret) return ret; mutex_lock(&rv8803->flags_lock); - flags = i2c_smbus_read_byte_data(rv8803->client, RV8803_FLAG); + flags = rv8803_read_reg(rv8803->client, RV8803_FLAG); if (flags < 0) { mutex_unlock(&rv8803->flags_lock); return flags; } - ret = i2c_smbus_write_byte_data(rv8803->client, RV8803_FLAG, - flags & ~RV8803_FLAG_V2F); + ret = rv8803_write_reg(rv8803->client, RV8803_FLAG, + flags & ~(RV8803_FLAG_V1F | RV8803_FLAG_V2F)); mutex_unlock(&rv8803->flags_lock); @@ -198,22 +279,18 @@ static int rv8803_get_alarm(struct device *dev, struct rtc_wkalrm *alrm) u8 alarmvals[3]; int flags, ret; - ret = i2c_smbus_read_i2c_block_data(client, RV8803_ALARM_MIN, - 3, alarmvals); - if (ret != 3) - return ret < 0 ? ret : -EIO; + ret = rv8803_read_regs(client, RV8803_ALARM_MIN, 3, alarmvals); + if (ret) + return ret; - flags = i2c_smbus_read_byte_data(client, RV8803_FLAG); + flags = rv8803_read_reg(client, RV8803_FLAG); if (flags < 0) return flags; alrm->time.tm_sec = 0; alrm->time.tm_min = bcd2bin(alarmvals[0] & 0x7f); alrm->time.tm_hour = bcd2bin(alarmvals[1] & 0x3f); - alrm->time.tm_wday = -1; alrm->time.tm_mday = bcd2bin(alarmvals[2] & 0x3f); - alrm->time.tm_mon = -1; - alrm->time.tm_year = -1; alrm->enabled = !!(rv8803->ctrl & RV8803_CTRL_AIE); alrm->pending = (flags & RV8803_FLAG_AF) && alrm->enabled; @@ -239,10 +316,10 @@ static int rv8803_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) mutex_lock(&rv8803->flags_lock); - ret = i2c_smbus_read_i2c_block_data(client, RV8803_FLAG, 2, ctrl); - if (ret != 2) { + ret = rv8803_read_regs(client, RV8803_FLAG, 2, ctrl); + if (ret) { mutex_unlock(&rv8803->flags_lock); - return ret < 0 ? 
ret : -EIO; + return ret; } alarmvals[0] = bin2bcd(alrm->time.tm_min); @@ -251,8 +328,8 @@ static int rv8803_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) if (rv8803->ctrl & (RV8803_CTRL_AIE | RV8803_CTRL_UIE)) { rv8803->ctrl &= ~(RV8803_CTRL_AIE | RV8803_CTRL_UIE); - err = i2c_smbus_write_byte_data(rv8803->client, RV8803_CTRL, - rv8803->ctrl); + err = rv8803_write_reg(rv8803->client, RV8803_CTRL, + rv8803->ctrl); if (err) { mutex_unlock(&rv8803->flags_lock); return err; @@ -260,13 +337,12 @@ static int rv8803_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) } ctrl[1] &= ~RV8803_FLAG_AF; - err = i2c_smbus_write_byte_data(rv8803->client, RV8803_FLAG, ctrl[1]); + err = rv8803_write_reg(rv8803->client, RV8803_FLAG, ctrl[1]); mutex_unlock(&rv8803->flags_lock); if (err) return err; - err = i2c_smbus_write_i2c_block_data(rv8803->client, RV8803_ALARM_MIN, - 3, alarmvals); + err = rv8803_write_regs(rv8803->client, RV8803_ALARM_MIN, 3, alarmvals); if (err) return err; @@ -276,8 +352,8 @@ static int rv8803_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) if (rv8803->rtc->aie_timer.enabled) rv8803->ctrl |= RV8803_CTRL_AIE; - err = i2c_smbus_write_byte_data(rv8803->client, RV8803_CTRL, - rv8803->ctrl); + err = rv8803_write_reg(rv8803->client, RV8803_CTRL, + rv8803->ctrl); if (err) return err; } @@ -306,21 +382,20 @@ static int rv8803_alarm_irq_enable(struct device *dev, unsigned int enabled) } mutex_lock(&rv8803->flags_lock); - flags = i2c_smbus_read_byte_data(client, RV8803_FLAG); + flags = rv8803_read_reg(client, RV8803_FLAG); if (flags < 0) { mutex_unlock(&rv8803->flags_lock); return flags; } flags &= ~(RV8803_FLAG_AF | RV8803_FLAG_UF); - err = i2c_smbus_write_byte_data(client, RV8803_FLAG, flags); + err = rv8803_write_reg(client, RV8803_FLAG, flags); mutex_unlock(&rv8803->flags_lock); if (err) return err; if (ctrl != rv8803->ctrl) { rv8803->ctrl = ctrl; - err = i2c_smbus_write_byte_data(client, RV8803_CTRL, - rv8803->ctrl); + err = 
rv8803_write_reg(client, RV8803_CTRL, rv8803->ctrl); if (err) return err; } @@ -336,7 +411,7 @@ static int rv8803_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) switch (cmd) { case RTC_VL_READ: - flags = i2c_smbus_read_byte_data(client, RV8803_FLAG); + flags = rv8803_read_reg(client, RV8803_FLAG); if (flags < 0) return flags; @@ -355,16 +430,16 @@ static int rv8803_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) case RTC_VL_CLR: mutex_lock(&rv8803->flags_lock); - flags = i2c_smbus_read_byte_data(client, RV8803_FLAG); + flags = rv8803_read_reg(client, RV8803_FLAG); if (flags < 0) { mutex_unlock(&rv8803->flags_lock); return flags; } flags &= ~(RV8803_FLAG_V1F | RV8803_FLAG_V2F); - ret = i2c_smbus_write_byte_data(client, RV8803_FLAG, flags); + ret = rv8803_write_reg(client, RV8803_FLAG, flags); mutex_unlock(&rv8803->flags_lock); - if (ret < 0) + if (ret) return ret; return 0; @@ -382,8 +457,8 @@ static ssize_t rv8803_nvram_write(struct file *filp, struct kobject *kobj, struct i2c_client *client = to_i2c_client(dev); int ret; - ret = i2c_smbus_write_byte_data(client, RV8803_RAM, buf[0]); - if (ret < 0) + ret = rv8803_write_reg(client, RV8803_RAM, buf[0]); + if (ret) return ret; return 1; @@ -397,7 +472,7 @@ static ssize_t rv8803_nvram_read(struct file *filp, struct kobject *kobj, struct i2c_client *client = to_i2c_client(dev); int ret; - ret = i2c_smbus_read_byte_data(client, RV8803_RAM); + ret = rv8803_read_reg(client, RV8803_RAM); if (ret < 0) return ret; @@ -427,7 +502,7 @@ static int rv8803_probe(struct i2c_client *client, { struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent); struct rv8803_data *rv8803; - int err, flags, try = 0; + int err, flags; if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_I2C_BLOCK)) { @@ -444,16 +519,7 @@ static int rv8803_probe(struct i2c_client *client, rv8803->client = client; i2c_set_clientdata(client, rv8803); - /* - * There is a 60µs window where the RTC may 
not reply on the i2c bus in - * that case, the transfer is not ACKed. In that case, ensure there are - * multiple attempts. - */ - do { - flags = i2c_smbus_read_byte_data(client, RV8803_FLAG); - try++; - } while ((flags == -ENXIO) && (try < 3)); - + flags = rv8803_read_reg(client, RV8803_FLAG); if (flags < 0) return flags; @@ -488,12 +554,7 @@ static int rv8803_probe(struct i2c_client *client, return PTR_ERR(rv8803->rtc); } - try = 0; - do { - err = i2c_smbus_write_byte_data(rv8803->client, RV8803_EXT, - RV8803_EXT_WADA); - try++; - } while ((err == -ENXIO) && (try < 3)); + err = rv8803_write_reg(rv8803->client, RV8803_EXT, RV8803_EXT_WADA); if (err) return err; diff --git a/drivers/rtc/rtc-rx8010.c b/drivers/rtc/rtc-rx8010.c index 772d221ec2d9..7163b91bb773 100644 --- a/drivers/rtc/rtc-rx8010.c +++ b/drivers/rtc/rtc-rx8010.c @@ -272,15 +272,9 @@ static int rx8010_read_alarm(struct device *dev, struct rtc_wkalrm *t) t->time.tm_min = bcd2bin(alarmvals[0] & 0x7f); t->time.tm_hour = bcd2bin(alarmvals[1] & 0x3f); - if (alarmvals[2] & RX8010_ALARM_AE) - t->time.tm_mday = -1; - else + if (!(alarmvals[2] & RX8010_ALARM_AE)) t->time.tm_mday = bcd2bin(alarmvals[2] & 0x7f); - t->time.tm_wday = -1; - t->time.tm_mon = -1; - t->time.tm_year = -1; - t->enabled = !!(rx8010->ctrlreg & RX8010_CTRL_AIE); t->pending = (flagreg & RX8010_FLAG_AF) && t->enabled; diff --git a/drivers/rtc/rtc-rx8025.c b/drivers/rtc/rtc-rx8025.c index 9f105efbc546..2b85cc7a24e7 100644 --- a/drivers/rtc/rtc-rx8025.c +++ b/drivers/rtc/rtc-rx8025.c @@ -319,11 +319,6 @@ static int rx8025_read_alarm(struct device *dev, struct rtc_wkalrm *t) t->time.tm_hour = bcd2bin(ald[1] & 0x1f) % 12 + (ald[1] & 0x20 ? 
12 : 0); - t->time.tm_wday = -1; - t->time.tm_mday = -1; - t->time.tm_mon = -1; - t->time.tm_year = -1; - dev_dbg(dev, "%s: date: %ds %dm %dh %dmd %dm %dy\n", __func__, t->time.tm_sec, t->time.tm_min, t->time.tm_hour, diff --git a/drivers/rtc/rtc-s35390a.c b/drivers/rtc/rtc-s35390a.c index f40afdd0e5f5..5dab4665ca3b 100644 --- a/drivers/rtc/rtc-s35390a.c +++ b/drivers/rtc/rtc-s35390a.c @@ -15,6 +15,7 @@ #include <linux/bitrev.h> #include <linux/bcd.h> #include <linux/slab.h> +#include <linux/delay.h> #define S35390A_CMD_STATUS1 0 #define S35390A_CMD_STATUS2 1 @@ -34,10 +35,14 @@ #define S35390A_ALRM_BYTE_HOURS 1 #define S35390A_ALRM_BYTE_MINS 2 +/* flags for STATUS1 */ #define S35390A_FLAG_POC 0x01 #define S35390A_FLAG_BLD 0x02 +#define S35390A_FLAG_INT2 0x04 #define S35390A_FLAG_24H 0x40 #define S35390A_FLAG_RESET 0x80 + +/* flag for STATUS2 */ #define S35390A_FLAG_TEST 0x01 #define S35390A_INT2_MODE_MASK 0xF0 @@ -94,19 +99,63 @@ static int s35390a_get_reg(struct s35390a *s35390a, int reg, char *buf, int len) return 0; } -static int s35390a_reset(struct s35390a *s35390a) +/* + * Returns <0 on error, 0 if rtc is setup fine and 1 if the chip was reset. + * To keep the information if an irq is pending, pass the value read from + * STATUS1 to the caller. + */ +static int s35390a_reset(struct s35390a *s35390a, char *status1) { - char buf[1]; - - if (s35390a_get_reg(s35390a, S35390A_CMD_STATUS1, buf, sizeof(buf)) < 0) - return -EIO; - - if (!(buf[0] & (S35390A_FLAG_POC | S35390A_FLAG_BLD))) + char buf; + int ret; + unsigned initcount = 0; + + ret = s35390a_get_reg(s35390a, S35390A_CMD_STATUS1, status1, 1); + if (ret < 0) + return ret; + + if (*status1 & S35390A_FLAG_POC) + /* + * Do not communicate for 0.5 seconds since the power-on + * detection circuit is in operation. + */ + msleep(500); + else if (!(*status1 & S35390A_FLAG_BLD)) + /* + * If both POC and BLD are unset everything is fine. 
+ */ return 0; - buf[0] |= (S35390A_FLAG_RESET | S35390A_FLAG_24H); - buf[0] &= 0xf0; - return s35390a_set_reg(s35390a, S35390A_CMD_STATUS1, buf, sizeof(buf)); + /* + * At least one of POC and BLD are set, so reinitialise chip. Keeping + * this information in the hardware to know later that the time isn't + * valid is unfortunately not possible because POC and BLD are cleared + * on read. So the reset is best done now. + * + * The 24H bit is kept over reset, so set it already here. + */ +initialize: + *status1 = S35390A_FLAG_24H; + buf = S35390A_FLAG_RESET | S35390A_FLAG_24H; + ret = s35390a_set_reg(s35390a, S35390A_CMD_STATUS1, &buf, 1); + + if (ret < 0) + return ret; + + ret = s35390a_get_reg(s35390a, S35390A_CMD_STATUS1, &buf, 1); + if (ret < 0) + return ret; + + if (buf & (S35390A_FLAG_POC | S35390A_FLAG_BLD)) { + /* Try up to five times to reset the chip */ + if (initcount < 5) { + ++initcount; + goto initialize; + } else + return -EIO; + } + + return 1; } static int s35390a_disable_test_mode(struct s35390a *s35390a) @@ -217,12 +266,12 @@ static int s35390a_set_alarm(struct i2c_client *client, struct rtc_wkalrm *alm) alm->time.tm_min, alm->time.tm_hour, alm->time.tm_mday, alm->time.tm_mon, alm->time.tm_year, alm->time.tm_wday); - /* disable interrupt */ + /* disable interrupt (which deasserts the irq line) */ err = s35390a_set_reg(s35390a, S35390A_CMD_STATUS2, &sts, sizeof(sts)); if (err < 0) return err; - /* clear pending interrupt, if any */ + /* clear pending interrupt (in STATUS1 only), if any */ err = s35390a_get_reg(s35390a, S35390A_CMD_STATUS1, &sts, sizeof(sts)); if (err < 0) return err; @@ -242,6 +291,8 @@ static int s35390a_set_alarm(struct i2c_client *client, struct rtc_wkalrm *alm) if (alm->time.tm_wday != -1) buf[S35390A_ALRM_BYTE_WDAY] = bin2bcd(alm->time.tm_wday) | 0x80; + else + buf[S35390A_ALRM_BYTE_WDAY] = 0; buf[S35390A_ALRM_BYTE_HOURS] = s35390a_hr2reg(s35390a, alm->time.tm_hour) | 0x80; @@ -269,23 +320,43 @@ static int 
s35390a_read_alarm(struct i2c_client *client, struct rtc_wkalrm *alm) if (err < 0) return err; - if (bitrev8(sts) != S35390A_INT2_MODE_ALARM) - return -EINVAL; + if ((bitrev8(sts) & S35390A_INT2_MODE_MASK) != S35390A_INT2_MODE_ALARM) { + /* + * When the alarm isn't enabled, the register to configure + * the alarm time isn't accessible. + */ + alm->enabled = 0; + return 0; + } else { + alm->enabled = 1; + } err = s35390a_get_reg(s35390a, S35390A_CMD_INT2_REG1, buf, sizeof(buf)); if (err < 0) return err; /* This chip returns the bits of each byte in reverse order */ - for (i = 0; i < 3; ++i) { + for (i = 0; i < 3; ++i) buf[i] = bitrev8(buf[i]); - buf[i] &= ~0x80; - } - alm->time.tm_wday = bcd2bin(buf[S35390A_ALRM_BYTE_WDAY]); - alm->time.tm_hour = s35390a_reg2hr(s35390a, - buf[S35390A_ALRM_BYTE_HOURS]); - alm->time.tm_min = bcd2bin(buf[S35390A_ALRM_BYTE_MINS]); + /* + * B0 of the three matching registers is an enable flag. Iff it is set + * the configured value is used for matching. + */ + if (buf[S35390A_ALRM_BYTE_WDAY] & 0x80) + alm->time.tm_wday = + bcd2bin(buf[S35390A_ALRM_BYTE_WDAY] & ~0x80); + + if (buf[S35390A_ALRM_BYTE_HOURS] & 0x80) + alm->time.tm_hour = + s35390a_reg2hr(s35390a, + buf[S35390A_ALRM_BYTE_HOURS] & ~0x80); + + if (buf[S35390A_ALRM_BYTE_MINS] & 0x80) + alm->time.tm_min = bcd2bin(buf[S35390A_ALRM_BYTE_MINS] & ~0x80); + + /* alarm triggers always at s=0 */ + alm->time.tm_sec = 0; dev_dbg(&client->dev, "%s: alm is mins=%d, hours=%d, wday=%d\n", __func__, alm->time.tm_min, alm->time.tm_hour, @@ -327,11 +398,11 @@ static struct i2c_driver s35390a_driver; static int s35390a_probe(struct i2c_client *client, const struct i2c_device_id *id) { - int err; + int err, err_reset; unsigned int i; struct s35390a *s35390a; struct rtc_time tm; - char buf[1]; + char buf, status1; if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { err = -ENODEV; @@ -360,29 +431,35 @@ static int s35390a_probe(struct i2c_client *client, } } - err = s35390a_reset(s35390a); 
- if (err < 0) { + err_reset = s35390a_reset(s35390a, &status1); + if (err_reset < 0) { + err = err_reset; dev_err(&client->dev, "error resetting chip\n"); goto exit_dummy; } - err = s35390a_disable_test_mode(s35390a); - if (err < 0) { - dev_err(&client->dev, "error disabling test mode\n"); - goto exit_dummy; - } - - err = s35390a_get_reg(s35390a, S35390A_CMD_STATUS1, buf, sizeof(buf)); - if (err < 0) { - dev_err(&client->dev, "error checking 12/24 hour mode\n"); - goto exit_dummy; - } - if (buf[0] & S35390A_FLAG_24H) + if (status1 & S35390A_FLAG_24H) s35390a->twentyfourhour = 1; else s35390a->twentyfourhour = 0; - if (s35390a_get_datetime(client, &tm) < 0) + if (status1 & S35390A_FLAG_INT2) { + /* disable alarm (and maybe test mode) */ + buf = 0; + err = s35390a_set_reg(s35390a, S35390A_CMD_STATUS2, &buf, 1); + if (err < 0) { + dev_err(&client->dev, "error disabling alarm"); + goto exit_dummy; + } + } else { + err = s35390a_disable_test_mode(s35390a); + if (err < 0) { + dev_err(&client->dev, "error disabling test mode\n"); + goto exit_dummy; + } + } + + if (err_reset > 0 || s35390a_get_datetime(client, &tm) < 0) dev_warn(&client->dev, "clock needs to be set\n"); device_set_wakeup_capable(&client->dev, 1); @@ -395,6 +472,10 @@ static int s35390a_probe(struct i2c_client *client, err = PTR_ERR(s35390a->rtc); goto exit_dummy; } + + if (status1 & S35390A_FLAG_INT2) + rtc_update_irq(s35390a->rtc, 1, RTC_AF); + return 0; exit_dummy: diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index d01ad7e8078e..d44fb34df8fe 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -149,12 +149,14 @@ static int s3c_rtc_setfreq(struct s3c_rtc *info, int freq) if (!is_power_of_2(freq)) return -EINVAL; + s3c_rtc_enable_clk(info); spin_lock_irq(&info->pie_lock); if (info->data->set_freq) info->data->set_freq(info, freq); spin_unlock_irq(&info->pie_lock); + s3c_rtc_disable_clk(info); return 0; } @@ -264,35 +266,23 @@ static int s3c_rtc_getalarm(struct device *dev, 
struct rtc_wkalrm *alrm) /* decode the alarm enable field */ if (alm_en & S3C2410_RTCALM_SECEN) alm_tm->tm_sec = bcd2bin(alm_tm->tm_sec); - else - alm_tm->tm_sec = -1; if (alm_en & S3C2410_RTCALM_MINEN) alm_tm->tm_min = bcd2bin(alm_tm->tm_min); - else - alm_tm->tm_min = -1; if (alm_en & S3C2410_RTCALM_HOUREN) alm_tm->tm_hour = bcd2bin(alm_tm->tm_hour); - else - alm_tm->tm_hour = -1; if (alm_en & S3C2410_RTCALM_DAYEN) alm_tm->tm_mday = bcd2bin(alm_tm->tm_mday); - else - alm_tm->tm_mday = -1; if (alm_en & S3C2410_RTCALM_MONEN) { alm_tm->tm_mon = bcd2bin(alm_tm->tm_mon); alm_tm->tm_mon -= 1; - } else { - alm_tm->tm_mon = -1; } if (alm_en & S3C2410_RTCALM_YEAREN) alm_tm->tm_year = bcd2bin(alm_tm->tm_year); - else - alm_tm->tm_year = -1; return 0; } @@ -577,8 +567,6 @@ static int s3c_rtc_probe(struct platform_device *pdev) s3c_rtc_setfreq(info, 1); - s3c_rtc_disable_clk(info); - return 0; err_nortc: diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index a45845a571e5..17b6235d67a5 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -481,7 +481,6 @@ static int sh_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) tm->tm_mon = sh_rtc_read_alarm_value(rtc, RMONAR); if (tm->tm_mon > 0) tm->tm_mon -= 1; /* RTC is 1-12, tm_mon is 0-11 */ - tm->tm_year = 0xffff; wkalrm->enabled = (readb(rtc->regbase + RCR1) & RCR1_AIE) ? 1 : 0; @@ -500,52 +499,13 @@ static inline void sh_rtc_write_alarm_value(struct sh_rtc *rtc, writeb(bin2bcd(value) | AR_ENB, rtc->regbase + reg_off); } -static int sh_rtc_check_alarm(struct rtc_time *tm) -{ - /* - * The original rtc says anything > 0xc0 is "don't care" or "match - * all" - most users use 0xff but rtc-dev uses -1 for the same thing. - * The original rtc doesn't support years - some things use -1 and - * some 0xffff. We use -1 to make out tests easier. 
- */ - if (tm->tm_year == 0xffff) - tm->tm_year = -1; - if (tm->tm_mon >= 0xff) - tm->tm_mon = -1; - if (tm->tm_mday >= 0xff) - tm->tm_mday = -1; - if (tm->tm_wday >= 0xff) - tm->tm_wday = -1; - if (tm->tm_hour >= 0xff) - tm->tm_hour = -1; - if (tm->tm_min >= 0xff) - tm->tm_min = -1; - if (tm->tm_sec >= 0xff) - tm->tm_sec = -1; - - if (tm->tm_year > 9999 || - tm->tm_mon >= 12 || - tm->tm_mday == 0 || tm->tm_mday >= 32 || - tm->tm_wday >= 7 || - tm->tm_hour >= 24 || - tm->tm_min >= 60 || - tm->tm_sec >= 60) - return -EINVAL; - - return 0; -} - static int sh_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) { struct platform_device *pdev = to_platform_device(dev); struct sh_rtc *rtc = platform_get_drvdata(pdev); unsigned int rcr1; struct rtc_time *tm = &wkalrm->time; - int mon, err; - - err = sh_rtc_check_alarm(tm); - if (unlikely(err < 0)) - return err; + int mon; spin_lock_irq(&rtc->lock); diff --git a/drivers/rtc/rtc-tegra.c b/drivers/rtc/rtc-tegra.c index 60232bd366ef..15ac597d54da 100644 --- a/drivers/rtc/rtc-tegra.c +++ b/drivers/rtc/rtc-tegra.c @@ -179,12 +179,6 @@ static int tegra_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm) if (sec == 0) { /* alarm is disabled. */ alarm->enabled = 0; - alarm->time.tm_mon = -1; - alarm->time.tm_mday = -1; - alarm->time.tm_year = -1; - alarm->time.tm_hour = -1; - alarm->time.tm_min = -1; - alarm->time.tm_sec = -1; } else { /* alarm is enabled. 
*/ alarm->enabled = 1; diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index 7a0436329d6c..1f3117b5a83c 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -25,7 +25,7 @@ #include <linux/rtc.h> #include <linux/types.h> #include <linux/bcd.h> -#include <linux/rtc-v3020.h> +#include <linux/platform_data/rtc-v3020.h> #include <linux/delay.h> #include <linux/gpio.h> #include <linux/slab.h> diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 1918f5483b23..7d1b4317eccc 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -838,6 +838,23 @@ config SCSI_IBMVSCSI To compile this driver as a module, choose M here: the module will be called ibmvscsi. +config SCSI_IBMVSCSIS + tristate "IBM Virtual SCSI Server support" + depends on PPC_PSERIES && TARGET_CORE && SCSI && PCI + help + This is the IBM POWER Virtual SCSI Target Server + This driver uses the SRP protocol for communication between servers + guest and/or the host that run on the same server. + More information on VSCSI protocol can be found at www.power.org + + The userspace configuration needed to initialize the driver can be + found here: + + https://github.com/powervm/ibmvscsis/wiki/Configuration + + To compile this driver as a module, choose M here: the + module will be called ibmvscsis. 
+ config SCSI_IBMVFC tristate "IBM Virtual FC support" depends on PPC_PSERIES && SCSI diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index 862ab4efad61..d5397987e731 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -128,6 +128,7 @@ obj-$(CONFIG_SCSI_SNI_53C710) += 53c700.o sni_53c710.o obj-$(CONFIG_SCSI_NSP32) += nsp32.o obj-$(CONFIG_SCSI_IPR) += ipr.o obj-$(CONFIG_SCSI_IBMVSCSI) += ibmvscsi/ +obj-$(CONFIG_SCSI_IBMVSCSIS) += ibmvscsi_tgt/ obj-$(CONFIG_SCSI_IBMVFC) += ibmvscsi/ obj-$(CONFIG_SCSI_HPTIOP) += hptiop.o obj-$(CONFIG_SCSI_STEX) += stex.o diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h index 8fae03215a85..5c70a52ad346 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.h +++ b/drivers/scsi/ibmvscsi/ibmvfc.h @@ -26,7 +26,7 @@ #include <linux/list.h> #include <linux/types.h> -#include "viosrp.h" +#include <scsi/viosrp.h> #define IBMVFC_NAME "ibmvfc" #define IBMVFC_DRIVER_VERSION "1.0.11" diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.h b/drivers/scsi/ibmvscsi/ibmvscsi.h index 1067367395cd..e0f6c3aeb4ee 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.h +++ b/drivers/scsi/ibmvscsi/ibmvscsi.h @@ -33,7 +33,7 @@ #include <linux/list.h> #include <linux/completion.h> #include <linux/interrupt.h> -#include "viosrp.h" +#include <scsi/viosrp.h> struct scsi_cmnd; struct Scsi_Host; diff --git a/drivers/scsi/ibmvscsi_tgt/Makefile b/drivers/scsi/ibmvscsi_tgt/Makefile new file mode 100644 index 000000000000..0c060ce64cb0 --- /dev/null +++ b/drivers/scsi/ibmvscsi_tgt/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_SCSI_IBMVSCSIS) += ibmvscsis.o + +ibmvscsis-y := libsrp.o ibmvscsi_tgt.o diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c new file mode 100644 index 000000000000..b29fef9d0f27 --- /dev/null +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c @@ -0,0 +1,4087 @@ +/******************************************************************************* + * IBM Virtual SCSI Target Driver + * Copyright 
(C) 2003-2005 Dave Boutcher (boutcher@us.ibm.com) IBM Corp. + * Santiago Leon (santil@us.ibm.com) IBM Corp. + * Linda Xie (lxie@us.ibm.com) IBM Corp. + * + * Copyright (C) 2005-2011 FUJITA Tomonori <tomof@acm.org> + * Copyright (C) 2010 Nicholas A. Bellinger <nab@kernel.org> + * + * Authors: Bryant G. Ly <bryantly@linux.vnet.ibm.com> + * Authors: Michael Cyr <mikecyr@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + ****************************************************************************/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/string.h> + +#include <target/target_core_base.h> +#include <target/target_core_fabric.h> + +#include <asm/hvcall.h> +#include <asm/vio.h> + +#include <scsi/viosrp.h> + +#include "ibmvscsi_tgt.h" + +#define IBMVSCSIS_VERSION "v0.2" + +#define INITIAL_SRP_LIMIT 800 +#define DEFAULT_MAX_SECTORS 256 + +static uint max_vdma_size = MAX_H_COPY_RDMA; + +static char system_id[SYS_ID_NAME_LEN] = ""; +static char partition_name[PARTITION_NAMELEN] = "UNKNOWN"; +static uint partition_number = -1; + +/* Adapter list and lock to control it */ +static DEFINE_SPINLOCK(ibmvscsis_dev_lock); +static LIST_HEAD(ibmvscsis_dev_list); + +static long ibmvscsis_parse_command(struct scsi_info *vscsi, + struct viosrp_crq *crq); + +static void ibmvscsis_adapter_idle(struct scsi_info *vscsi); + +static void 
ibmvscsis_determine_resid(struct se_cmd *se_cmd, + struct srp_rsp *rsp) +{ + u32 residual_count = se_cmd->residual_count; + + if (!residual_count) + return; + + if (se_cmd->se_cmd_flags & SCF_UNDERFLOW_BIT) { + if (se_cmd->data_direction == DMA_TO_DEVICE) { + /* residual data from an underflow write */ + rsp->flags = SRP_RSP_FLAG_DOUNDER; + rsp->data_out_res_cnt = cpu_to_be32(residual_count); + } else if (se_cmd->data_direction == DMA_FROM_DEVICE) { + /* residual data from an underflow read */ + rsp->flags = SRP_RSP_FLAG_DIUNDER; + rsp->data_in_res_cnt = cpu_to_be32(residual_count); + } + } else if (se_cmd->se_cmd_flags & SCF_OVERFLOW_BIT) { + if (se_cmd->data_direction == DMA_TO_DEVICE) { + /* residual data from an overflow write */ + rsp->flags = SRP_RSP_FLAG_DOOVER; + rsp->data_out_res_cnt = cpu_to_be32(residual_count); + } else if (se_cmd->data_direction == DMA_FROM_DEVICE) { + /* residual data from an overflow read */ + rsp->flags = SRP_RSP_FLAG_DIOVER; + rsp->data_in_res_cnt = cpu_to_be32(residual_count); + } + } +} + +/** + * connection_broken() - Determine if the connection to the client is good + * @vscsi: Pointer to our adapter structure + * + * This function attempts to send a ping MAD to the client. If the call to + * queue the request returns H_CLOSED then the connection has been broken + * and the function returns TRUE. 
+ * + * EXECUTION ENVIRONMENT: + * Interrupt or Process environment + */ +static bool connection_broken(struct scsi_info *vscsi) +{ + struct viosrp_crq *crq; + u64 buffer[2] = { 0, 0 }; + long h_return_code; + bool rc = false; + + /* create a PING crq */ + crq = (struct viosrp_crq *)&buffer; + crq->valid = VALID_CMD_RESP_EL; + crq->format = MESSAGE_IN_CRQ; + crq->status = PING; + + h_return_code = h_send_crq(vscsi->dds.unit_id, + cpu_to_be64(buffer[MSG_HI]), + cpu_to_be64(buffer[MSG_LOW])); + + pr_debug("connection_broken: rc %ld\n", h_return_code); + + if (h_return_code == H_CLOSED) + rc = true; + + return rc; +} + +/** + * ibmvscsis_unregister_command_q() - Helper Function-Unregister Command Queue + * @vscsi: Pointer to our adapter structure + * + * This function calls h_free_q then frees the interrupt bit etc. + * It must release the lock before doing so because of the time it can take + * for h_free_crq in PHYP + * NOTE: the caller must make sure that state and or flags will prevent + * interrupt handler from scheduling work. 
+ * NOTE: anyone calling this function may need to set the CRQ_CLOSED flag + * we can't do it here, because we don't have the lock + * + * EXECUTION ENVIRONMENT: + * Process level + */ +static long ibmvscsis_unregister_command_q(struct scsi_info *vscsi) +{ + long qrc; + long rc = ADAPT_SUCCESS; + int ticks = 0; + + do { + qrc = h_free_crq(vscsi->dds.unit_id); + switch (qrc) { + case H_SUCCESS: + break; + + case H_HARDWARE: + case H_PARAMETER: + dev_err(&vscsi->dev, "unregister_command_q: error from h_free_crq %ld\n", + qrc); + rc = ERROR; + break; + + case H_BUSY: + case H_LONG_BUSY_ORDER_1_MSEC: + /* msleep not good for small values */ + usleep_range(1000, 2000); + ticks += 1; + break; + case H_LONG_BUSY_ORDER_10_MSEC: + usleep_range(10000, 20000); + ticks += 10; + break; + case H_LONG_BUSY_ORDER_100_MSEC: + msleep(100); + ticks += 100; + break; + case H_LONG_BUSY_ORDER_1_SEC: + ssleep(1); + ticks += 1000; + break; + case H_LONG_BUSY_ORDER_10_SEC: + ssleep(10); + ticks += 10000; + break; + case H_LONG_BUSY_ORDER_100_SEC: + ssleep(100); + ticks += 100000; + break; + default: + dev_err(&vscsi->dev, "unregister_command_q: unknown error %ld from h_free_crq\n", + qrc); + rc = ERROR; + break; + } + + /* + * don't wait more than 300 seconds + * ticks are in milliseconds more or less + */ + if (ticks > 300000 && qrc != H_SUCCESS) { + rc = ERROR; + dev_err(&vscsi->dev, "Excessive wait for h_free_crq\n"); + } + } while (qrc != H_SUCCESS && rc == ADAPT_SUCCESS); + + pr_debug("Freeing CRQ: phyp rc %ld, rc %ld\n", qrc, rc); + + return rc; +} + +/** + * ibmvscsis_delete_client_info() - Helper function to Delete Client Info + * @vscsi: Pointer to our adapter structure + * @client_closed: True if client closed its queue + * + * Deletes information specific to the client when the client goes away + * + * EXECUTION ENVIRONMENT: + * Interrupt or Process + */ +static void ibmvscsis_delete_client_info(struct scsi_info *vscsi, + bool client_closed) +{ + vscsi->client_cap = 0; + + /* + * 
Some things we don't want to clear if we're closing the queue, + * because some clients don't resend the host handshake when they + * get a transport event. + */ + if (client_closed) + vscsi->client_data.os_type = 0; +} + +/** + * ibmvscsis_free_command_q() - Free Command Queue + * @vscsi: Pointer to our adapter structure + * + * This function calls unregister_command_q, then clears interrupts and + * any pending interrupt acknowledgments associated with the command q. + * It also clears memory if there is no error. + * + * PHYP did not meet the PAPR architecture so that we must give up the + * lock. This causes a timing hole regarding state change. To close the + * hole this routine does accounting on any change that occurred during + * the time the lock is not held. + * NOTE: must give up and then acquire the interrupt lock, the caller must + * make sure that state and or flags will prevent interrupt handler from + * scheduling work. + * + * EXECUTION ENVIRONMENT: + * Process level, interrupt lock is held + */ +static long ibmvscsis_free_command_q(struct scsi_info *vscsi) +{ + int bytes; + u32 flags_under_lock; + u16 state_under_lock; + long rc = ADAPT_SUCCESS; + + if (!(vscsi->flags & CRQ_CLOSED)) { + vio_disable_interrupts(vscsi->dma_dev); + + state_under_lock = vscsi->new_state; + flags_under_lock = vscsi->flags; + vscsi->phyp_acr_state = 0; + vscsi->phyp_acr_flags = 0; + + spin_unlock_bh(&vscsi->intr_lock); + rc = ibmvscsis_unregister_command_q(vscsi); + spin_lock_bh(&vscsi->intr_lock); + + if (state_under_lock != vscsi->new_state) + vscsi->phyp_acr_state = vscsi->new_state; + + vscsi->phyp_acr_flags = ((~flags_under_lock) & vscsi->flags); + + if (rc == ADAPT_SUCCESS) { + bytes = vscsi->cmd_q.size * PAGE_SIZE; + memset(vscsi->cmd_q.base_addr, 0, bytes); + vscsi->cmd_q.index = 0; + vscsi->flags |= CRQ_CLOSED; + + ibmvscsis_delete_client_info(vscsi, false); + } + + pr_debug("free_command_q: flags 0x%x, state 0x%hx, acr_flags 0x%x, acr_state 0x%hx\n", + 
vscsi->flags, vscsi->state, vscsi->phyp_acr_flags, + vscsi->phyp_acr_state); + } + return rc; +} + +/** + * ibmvscsis_cmd_q_dequeue() - Get valid Command element + * @mask: Mask to use in case index wraps + * @current_index: Current index into command queue + * @base_addr: Pointer to start of command queue + * + * Returns a pointer to a valid command element or NULL, if the command + * queue is empty + * + * EXECUTION ENVIRONMENT: + * Interrupt environment, interrupt lock held + */ +static struct viosrp_crq *ibmvscsis_cmd_q_dequeue(uint mask, + uint *current_index, + struct viosrp_crq *base_addr) +{ + struct viosrp_crq *ptr; + + ptr = base_addr + *current_index; + + if (ptr->valid) { + *current_index = (*current_index + 1) & mask; + dma_rmb(); + } else { + ptr = NULL; + } + + return ptr; +} + +/** + * ibmvscsis_send_init_message() - send initialize message to the client + * @vscsi: Pointer to our adapter structure + * @format: Which Init Message format to send + * + * EXECUTION ENVIRONMENT: + * Interrupt environment, interrupt lock held + */ +static long ibmvscsis_send_init_message(struct scsi_info *vscsi, u8 format) +{ + struct viosrp_crq *crq; + u64 buffer[2] = { 0, 0 }; + long rc; + + crq = (struct viosrp_crq *)&buffer; + crq->valid = VALID_INIT_MSG; + crq->format = format; + rc = h_send_crq(vscsi->dds.unit_id, cpu_to_be64(buffer[MSG_HI]), + cpu_to_be64(buffer[MSG_LOW])); + + return rc; +} + +/** + * ibmvscsis_check_init_msg() - Check init message valid + * @vscsi: Pointer to our adapter structure + * @format: Pointer to return format of Init Message, if any. + * Set to UNUSED_FORMAT if no Init Message in queue. + * + * Checks if an initialize message was queued by the initiator + * after the queue was created and before the interrupt was enabled. 
+ * + * EXECUTION ENVIRONMENT: + * Process level only, interrupt lock held + */ +static long ibmvscsis_check_init_msg(struct scsi_info *vscsi, uint *format) +{ + struct viosrp_crq *crq; + long rc = ADAPT_SUCCESS; + + crq = ibmvscsis_cmd_q_dequeue(vscsi->cmd_q.mask, &vscsi->cmd_q.index, + vscsi->cmd_q.base_addr); + if (!crq) { + *format = (uint)UNUSED_FORMAT; + } else if (crq->valid == VALID_INIT_MSG && crq->format == INIT_MSG) { + *format = (uint)INIT_MSG; + crq->valid = INVALIDATE_CMD_RESP_EL; + dma_rmb(); + + /* + * the caller has ensured no initialize message was + * sent after the queue was + * created so there should be no other message on the queue. + */ + crq = ibmvscsis_cmd_q_dequeue(vscsi->cmd_q.mask, + &vscsi->cmd_q.index, + vscsi->cmd_q.base_addr); + if (crq) { + *format = (uint)(crq->format); + rc = ERROR; + crq->valid = INVALIDATE_CMD_RESP_EL; + dma_rmb(); + } + } else { + *format = (uint)(crq->format); + rc = ERROR; + crq->valid = INVALIDATE_CMD_RESP_EL; + dma_rmb(); + } + + return rc; +} + +/** + * ibmvscsis_establish_new_q() - Establish new CRQ queue + * @vscsi: Pointer to our adapter structure + * @new_state: New state being established after resetting the queue + * + * Must be called with interrupt lock held. 
+ */ +static long ibmvscsis_establish_new_q(struct scsi_info *vscsi, uint new_state) +{ + long rc = ADAPT_SUCCESS; + uint format; + + vscsi->flags &= PRESERVE_FLAG_FIELDS; + vscsi->rsp_q_timer.timer_pops = 0; + vscsi->debit = 0; + vscsi->credit = 0; + + rc = vio_enable_interrupts(vscsi->dma_dev); + if (rc) { + pr_warn("reset_queue: failed to enable interrupts, rc %ld\n", + rc); + return rc; + } + + rc = ibmvscsis_check_init_msg(vscsi, &format); + if (rc) { + dev_err(&vscsi->dev, "reset_queue: check_init_msg failed, rc %ld\n", + rc); + return rc; + } + + if (format == UNUSED_FORMAT && new_state == WAIT_CONNECTION) { + rc = ibmvscsis_send_init_message(vscsi, INIT_MSG); + switch (rc) { + case H_SUCCESS: + case H_DROPPED: + case H_CLOSED: + rc = ADAPT_SUCCESS; + break; + + case H_PARAMETER: + case H_HARDWARE: + break; + + default: + vscsi->state = UNDEFINED; + rc = H_HARDWARE; + break; + } + } + + return rc; +} + +/** + * ibmvscsis_reset_queue() - Reset CRQ Queue + * @vscsi: Pointer to our adapter structure + * @new_state: New state to establish after resetting the queue + * + * This function calls h_free_q and then calls h_reg_q and does all + * of the bookkeeping to get us back to where we can communicate. + * + * Actually, we don't always call h_free_crq. A problem was discovered + * where one partition would close and reopen his queue, which would + * cause his partner to get a transport event, which would cause him to + * close and reopen his queue, which would cause the original partition + * to get a transport event, etc., etc. To prevent this, we don't + * actually close our queue if the client initiated the reset, (i.e. 
+ * either we got a transport event or we have detected that the client's + * queue is gone) + * + * EXECUTION ENVIRONMENT: + * Process environment, called with interrupt lock held + */ +static void ibmvscsis_reset_queue(struct scsi_info *vscsi, uint new_state) +{ + int bytes; + long rc = ADAPT_SUCCESS; + + pr_debug("reset_queue: flags 0x%x\n", vscsi->flags); + + /* don't reset, the client did it for us */ + if (vscsi->flags & (CLIENT_FAILED | TRANS_EVENT)) { + vscsi->flags &= PRESERVE_FLAG_FIELDS; + vscsi->rsp_q_timer.timer_pops = 0; + vscsi->debit = 0; + vscsi->credit = 0; + vscsi->state = new_state; + vio_enable_interrupts(vscsi->dma_dev); + } else { + rc = ibmvscsis_free_command_q(vscsi); + if (rc == ADAPT_SUCCESS) { + vscsi->state = new_state; + + bytes = vscsi->cmd_q.size * PAGE_SIZE; + rc = h_reg_crq(vscsi->dds.unit_id, + vscsi->cmd_q.crq_token, bytes); + if (rc == H_CLOSED || rc == H_SUCCESS) { + rc = ibmvscsis_establish_new_q(vscsi, + new_state); + } + + if (rc != ADAPT_SUCCESS) { + pr_debug("reset_queue: reg_crq rc %ld\n", rc); + + vscsi->state = ERR_DISCONNECTED; + vscsi->flags |= RESPONSE_Q_DOWN; + ibmvscsis_free_command_q(vscsi); + } + } else { + vscsi->state = ERR_DISCONNECTED; + vscsi->flags |= RESPONSE_Q_DOWN; + } + } +} + +/** + * ibmvscsis_free_cmd_resources() - Free command resources + * @vscsi: Pointer to our adapter structure + * @cmd: Command which is no longer in use + * + * Must be called with interrupt lock held. + */ +static void ibmvscsis_free_cmd_resources(struct scsi_info *vscsi, + struct ibmvscsis_cmd *cmd) +{ + struct iu_entry *iue = cmd->iue; + + switch (cmd->type) { + case TASK_MANAGEMENT: + case SCSI_CDB: + /* + * When the queue goes down this value is cleared, so it + * cannot be cleared in this general purpose function. 
+ */ + if (vscsi->debit) + vscsi->debit -= 1; + break; + case ADAPTER_MAD: + vscsi->flags &= ~PROCESSING_MAD; + break; + case UNSET_TYPE: + break; + default: + dev_err(&vscsi->dev, "free_cmd_resources unknown type %d\n", + cmd->type); + break; + } + + cmd->iue = NULL; + list_add_tail(&cmd->list, &vscsi->free_cmd); + srp_iu_put(iue); + + if (list_empty(&vscsi->active_q) && list_empty(&vscsi->schedule_q) && + list_empty(&vscsi->waiting_rsp) && (vscsi->flags & WAIT_FOR_IDLE)) { + vscsi->flags &= ~WAIT_FOR_IDLE; + complete(&vscsi->wait_idle); + } +} + +/** + * ibmvscsis_disconnect() - Helper function to disconnect + * @work: Pointer to work_struct, gives access to our adapter structure + * + * An error has occurred or the driver received a Transport event, + * and the driver is requesting that the command queue be de-registered + * in a safe manner. If there is no outstanding I/O then we can stop the + * queue. If we are restarting the queue it will be reflected in the + * state of the adapter. + * + * EXECUTION ENVIRONMENT: + * Process environment + */ +static void ibmvscsis_disconnect(struct work_struct *work) +{ + struct scsi_info *vscsi = container_of(work, struct scsi_info, + proc_work); + u16 new_state; + bool wait_idle = false; + long rc = ADAPT_SUCCESS; + + spin_lock_bh(&vscsi->intr_lock); + new_state = vscsi->new_state; + vscsi->new_state = 0; + + pr_debug("disconnect: flags 0x%x, state 0x%hx\n", vscsi->flags, + vscsi->state); + + /* + * check which state we are in and see if we + * should transition to the new state + */ + switch (vscsi->state) { + /* Should never be called while in this state. */ + case NO_QUEUE: + /* + * Can never transition from this state; + * ignore errors and logout. 
+ */ + case UNCONFIGURING: + break; + + /* can transition from this state to UNCONFIGURING */ + case ERR_DISCONNECT: + if (new_state == UNCONFIGURING) + vscsi->state = new_state; + break; + + /* + * Can transition from this state to unconfiguring + * or err disconnect. + */ + case ERR_DISCONNECT_RECONNECT: + switch (new_state) { + case UNCONFIGURING: + case ERR_DISCONNECT: + vscsi->state = new_state; + break; + + case WAIT_IDLE: + break; + default: + break; + } + break; + + /* can transition from this state to UNCONFIGURING */ + case ERR_DISCONNECTED: + if (new_state == UNCONFIGURING) + vscsi->state = new_state; + break; + + /* + * If this is a transition into an error state, + * a client is attempting to establish a connection + * and has violated the RPA protocol. + * There can be nothing pending on the adapter although + * there can be requests in the command queue. + */ + case WAIT_ENABLED: + case PART_UP_WAIT_ENAB: + switch (new_state) { + case ERR_DISCONNECT: + vscsi->flags |= RESPONSE_Q_DOWN; + vscsi->state = new_state; + vscsi->flags &= ~(SCHEDULE_DISCONNECT | + DISCONNECT_SCHEDULED); + ibmvscsis_free_command_q(vscsi); + break; + case ERR_DISCONNECT_RECONNECT: + ibmvscsis_reset_queue(vscsi, WAIT_ENABLED); + break; + + /* should never happen */ + case WAIT_IDLE: + rc = ERROR; + dev_err(&vscsi->dev, "disconnect: invalid state %d for WAIT_IDLE\n", + vscsi->state); + break; + } + break; + + case WAIT_IDLE: + switch (new_state) { + case ERR_DISCONNECT: + case ERR_DISCONNECT_RECONNECT: + vscsi->state = new_state; + break; + } + break; + + /* + * Initiator has not done a successful srp login + * or has done a successful srp logout (adapter was not + * busy). In the first case there can be responses queued + * waiting for space on the initiator's response queue (MAD) + * The second case the adapter is idle. Assume the worst case, + * i.e. the second case. 
+ */ + case WAIT_CONNECTION: + case CONNECTED: + case SRP_PROCESSING: + wait_idle = true; + vscsi->state = new_state; + break; + + /* can transition from this state to UNCONFIGURING */ + case UNDEFINED: + if (new_state == UNCONFIGURING) + vscsi->state = new_state; + break; + default: + break; + } + + if (wait_idle) { + pr_debug("disconnect start wait, active %d, sched %d\n", + (int)list_empty(&vscsi->active_q), + (int)list_empty(&vscsi->schedule_q)); + if (!list_empty(&vscsi->active_q) || + !list_empty(&vscsi->schedule_q)) { + vscsi->flags |= WAIT_FOR_IDLE; + pr_debug("disconnect flags 0x%x\n", vscsi->flags); + /* + * This routine is can not be called with the interrupt + * lock held. + */ + spin_unlock_bh(&vscsi->intr_lock); + wait_for_completion(&vscsi->wait_idle); + spin_lock_bh(&vscsi->intr_lock); + } + pr_debug("disconnect stop wait\n"); + + ibmvscsis_adapter_idle(vscsi); + } + + spin_unlock_bh(&vscsi->intr_lock); +} + +/** + * ibmvscsis_post_disconnect() - Schedule the disconnect + * @vscsi: Pointer to our adapter structure + * @new_state: State to move to after disconnecting + * @flag_bits: Flags to turn on in adapter structure + * + * If it's already been scheduled, then see if we need to "upgrade" + * the new state (if the one passed in is more "severe" than the + * previous one). 
+ * + * PRECONDITION: + * interrupt lock is held + */ +static void ibmvscsis_post_disconnect(struct scsi_info *vscsi, uint new_state, + uint flag_bits) +{ + uint state; + + /* check the validity of the new state */ + switch (new_state) { + case UNCONFIGURING: + case ERR_DISCONNECT: + case ERR_DISCONNECT_RECONNECT: + case WAIT_IDLE: + break; + + default: + dev_err(&vscsi->dev, "post_disconnect: Invalid new state %d\n", + new_state); + return; + } + + vscsi->flags |= flag_bits; + + pr_debug("post_disconnect: new_state 0x%x, flag_bits 0x%x, vscsi->flags 0x%x, state %hx\n", + new_state, flag_bits, vscsi->flags, vscsi->state); + + if (!(vscsi->flags & (DISCONNECT_SCHEDULED | SCHEDULE_DISCONNECT))) { + vscsi->flags |= SCHEDULE_DISCONNECT; + vscsi->new_state = new_state; + + INIT_WORK(&vscsi->proc_work, ibmvscsis_disconnect); + (void)queue_work(vscsi->work_q, &vscsi->proc_work); + } else { + if (vscsi->new_state) + state = vscsi->new_state; + else + state = vscsi->state; + + switch (state) { + case NO_QUEUE: + case UNCONFIGURING: + break; + + case ERR_DISCONNECTED: + case ERR_DISCONNECT: + case UNDEFINED: + if (new_state == UNCONFIGURING) + vscsi->new_state = new_state; + break; + + case ERR_DISCONNECT_RECONNECT: + switch (new_state) { + case UNCONFIGURING: + case ERR_DISCONNECT: + vscsi->new_state = new_state; + break; + default: + break; + } + break; + + case WAIT_ENABLED: + case PART_UP_WAIT_ENAB: + case WAIT_IDLE: + case WAIT_CONNECTION: + case CONNECTED: + case SRP_PROCESSING: + vscsi->new_state = new_state; + break; + + default: + break; + } + } + + pr_debug("Leaving post_disconnect: flags 0x%x, new_state 0x%x\n", + vscsi->flags, vscsi->new_state); +} + +/** + * ibmvscsis_trans_event() - Handle a Transport Event + * @vscsi: Pointer to our adapter structure + * @crq: Pointer to CRQ entry containing the Transport Event + * + * Do the logic to close the I_T nexus. This function may not + * behave to specification. 
+ * + * EXECUTION ENVIRONMENT: + * Interrupt, interrupt lock held + */ +static long ibmvscsis_trans_event(struct scsi_info *vscsi, + struct viosrp_crq *crq) +{ + long rc = ADAPT_SUCCESS; + + pr_debug("trans_event: format %d, flags 0x%x, state 0x%hx\n", + (int)crq->format, vscsi->flags, vscsi->state); + + switch (crq->format) { + case MIGRATED: + case PARTNER_FAILED: + case PARTNER_DEREGISTER: + ibmvscsis_delete_client_info(vscsi, true); + break; + + default: + rc = ERROR; + dev_err(&vscsi->dev, "trans_event: invalid format %d\n", + (uint)crq->format); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT, + RESPONSE_Q_DOWN); + break; + } + + if (rc == ADAPT_SUCCESS) { + switch (vscsi->state) { + case NO_QUEUE: + case ERR_DISCONNECTED: + case UNDEFINED: + break; + + case UNCONFIGURING: + vscsi->flags |= (RESPONSE_Q_DOWN | TRANS_EVENT); + break; + + case WAIT_ENABLED: + break; + + case WAIT_CONNECTION: + break; + + case CONNECTED: + ibmvscsis_post_disconnect(vscsi, WAIT_IDLE, + (RESPONSE_Q_DOWN | + TRANS_EVENT)); + break; + + case PART_UP_WAIT_ENAB: + vscsi->state = WAIT_ENABLED; + break; + + case SRP_PROCESSING: + if ((vscsi->debit > 0) || + !list_empty(&vscsi->schedule_q) || + !list_empty(&vscsi->waiting_rsp) || + !list_empty(&vscsi->active_q)) { + pr_debug("debit %d, sched %d, wait %d, active %d\n", + vscsi->debit, + (int)list_empty(&vscsi->schedule_q), + (int)list_empty(&vscsi->waiting_rsp), + (int)list_empty(&vscsi->active_q)); + pr_warn("connection lost with outstanding work\n"); + } else { + pr_debug("trans_event: SRP Processing, but no outstanding work\n"); + } + + ibmvscsis_post_disconnect(vscsi, WAIT_IDLE, + (RESPONSE_Q_DOWN | + TRANS_EVENT)); + break; + + case ERR_DISCONNECT: + case ERR_DISCONNECT_RECONNECT: + case WAIT_IDLE: + vscsi->flags |= (RESPONSE_Q_DOWN | TRANS_EVENT); + break; + } + } + + rc = vscsi->flags & SCHEDULE_DISCONNECT; + + pr_debug("Leaving trans_event: flags 0x%x, state 0x%hx, rc %ld\n", + vscsi->flags, vscsi->state, rc); + + return rc; +} + 
+/** + * ibmvscsis_poll_cmd_q() - Poll Command Queue + * @vscsi: Pointer to our adapter structure + * + * Called to handle command elements that may have arrived while + * interrupts were disabled. + * + * EXECUTION ENVIRONMENT: + * intr_lock must be held + */ +static void ibmvscsis_poll_cmd_q(struct scsi_info *vscsi) +{ + struct viosrp_crq *crq; + long rc; + bool ack = true; + volatile u8 valid; + + pr_debug("poll_cmd_q: flags 0x%x, state 0x%hx, q index %ud\n", + vscsi->flags, vscsi->state, vscsi->cmd_q.index); + + rc = vscsi->flags & SCHEDULE_DISCONNECT; + crq = vscsi->cmd_q.base_addr + vscsi->cmd_q.index; + valid = crq->valid; + dma_rmb(); + + while (valid) { +poll_work: + vscsi->cmd_q.index = + (vscsi->cmd_q.index + 1) & vscsi->cmd_q.mask; + + if (!rc) { + rc = ibmvscsis_parse_command(vscsi, crq); + } else { + if ((uint)crq->valid == VALID_TRANS_EVENT) { + /* + * must service the transport layer events even + * in an error state, dont break out until all + * the consecutive transport events have been + * processed + */ + rc = ibmvscsis_trans_event(vscsi, crq); + } else if (vscsi->flags & TRANS_EVENT) { + /* + * if a tranport event has occurred leave + * everything but transport events on the queue + */ + pr_debug("poll_cmd_q, ignoring\n"); + + /* + * need to decrement the queue index so we can + * look at the elment again + */ + if (vscsi->cmd_q.index) + vscsi->cmd_q.index -= 1; + else + /* + * index is at 0 it just wrapped. 
+ * have it index last element in q + */ + vscsi->cmd_q.index = vscsi->cmd_q.mask; + break; + } + } + + crq->valid = INVALIDATE_CMD_RESP_EL; + + crq = vscsi->cmd_q.base_addr + vscsi->cmd_q.index; + valid = crq->valid; + dma_rmb(); + } + + if (!rc) { + if (ack) { + vio_enable_interrupts(vscsi->dma_dev); + ack = false; + pr_debug("poll_cmd_q, reenabling interrupts\n"); + } + valid = crq->valid; + dma_rmb(); + if (valid) + goto poll_work; + } + + pr_debug("Leaving poll_cmd_q: rc %ld\n", rc); +} + +/** + * ibmvscsis_free_cmd_qs() - Free elements in queue + * @vscsi: Pointer to our adapter structure + * + * Free all of the elements on all queues that are waiting for + * whatever reason. + * + * PRECONDITION: + * Called with interrupt lock held + */ +static void ibmvscsis_free_cmd_qs(struct scsi_info *vscsi) +{ + struct ibmvscsis_cmd *cmd, *nxt; + + pr_debug("free_cmd_qs: waiting_rsp empty %d, timer starter %d\n", + (int)list_empty(&vscsi->waiting_rsp), + vscsi->rsp_q_timer.started); + + list_for_each_entry_safe(cmd, nxt, &vscsi->waiting_rsp, list) { + list_del(&cmd->list); + ibmvscsis_free_cmd_resources(vscsi, cmd); + } +} + +/** + * ibmvscsis_get_free_cmd() - Get free command from list + * @vscsi: Pointer to our adapter structure + * + * Must be called with interrupt lock held. 
+ */ +static struct ibmvscsis_cmd *ibmvscsis_get_free_cmd(struct scsi_info *vscsi) +{ + struct ibmvscsis_cmd *cmd = NULL; + struct iu_entry *iue; + + iue = srp_iu_get(&vscsi->target); + if (iue) { + cmd = list_first_entry_or_null(&vscsi->free_cmd, + struct ibmvscsis_cmd, list); + if (cmd) { + list_del(&cmd->list); + cmd->iue = iue; + cmd->type = UNSET_TYPE; + memset(&cmd->se_cmd, 0, sizeof(cmd->se_cmd)); + } else { + srp_iu_put(iue); + } + } + + return cmd; +} + +/** + * ibmvscsis_adapter_idle() - Helper function to handle idle adapter + * @vscsi: Pointer to our adapter structure + * + * This function is called when the adapter is idle when the driver + * is attempting to clear an error condition. + * The adapter is considered busy if any of its cmd queues + * are non-empty. This function can be invoked + * from the off level disconnect function. + * + * EXECUTION ENVIRONMENT: + * Process environment called with interrupt lock held + */ +static void ibmvscsis_adapter_idle(struct scsi_info *vscsi) +{ + int free_qs = false; + + pr_debug("adapter_idle: flags 0x%x, state 0x%hx\n", vscsi->flags, + vscsi->state); + + /* Only need to free qs if we're disconnecting from client */ + if (vscsi->state != WAIT_CONNECTION || vscsi->flags & TRANS_EVENT) + free_qs = true; + + switch (vscsi->state) { + case ERR_DISCONNECT_RECONNECT: + ibmvscsis_reset_queue(vscsi, WAIT_CONNECTION); + pr_debug("adapter_idle, disc_rec: flags 0x%x\n", vscsi->flags); + break; + + case ERR_DISCONNECT: + ibmvscsis_free_command_q(vscsi); + vscsi->flags &= ~DISCONNECT_SCHEDULED; + vscsi->flags |= RESPONSE_Q_DOWN; + vscsi->state = ERR_DISCONNECTED; + pr_debug("adapter_idle, disc: flags 0x%x, state 0x%hx\n", + vscsi->flags, vscsi->state); + break; + + case WAIT_IDLE: + vscsi->rsp_q_timer.timer_pops = 0; + vscsi->debit = 0; + vscsi->credit = 0; + if (vscsi->flags & TRANS_EVENT) { + vscsi->state = WAIT_CONNECTION; + vscsi->flags &= PRESERVE_FLAG_FIELDS; + } else { + vscsi->state = CONNECTED; + vscsi->flags &= 
~DISCONNECT_SCHEDULED; + } + + pr_debug("adapter_idle, wait: flags 0x%x, state 0x%hx\n", + vscsi->flags, vscsi->state); + ibmvscsis_poll_cmd_q(vscsi); + break; + + case ERR_DISCONNECTED: + vscsi->flags &= ~DISCONNECT_SCHEDULED; + pr_debug("adapter_idle, disconnected: flags 0x%x, state 0x%hx\n", + vscsi->flags, vscsi->state); + break; + + default: + dev_err(&vscsi->dev, "adapter_idle: in invalid state %d\n", + vscsi->state); + break; + } + + if (free_qs) + ibmvscsis_free_cmd_qs(vscsi); + + /* + * There is a timing window where we could lose a disconnect request. + * The known path to this window occurs during the DISCONNECT_RECONNECT + * case above: reset_queue calls free_command_q, which will release the + * interrupt lock. During that time, a new post_disconnect call can be + * made with a "more severe" state (DISCONNECT or UNCONFIGURING). + * Because the DISCONNECT_SCHEDULED flag is already set, post_disconnect + * will only set the new_state. Now free_command_q reacquires the intr + * lock and clears the DISCONNECT_SCHEDULED flag (using PRESERVE_FLAG_ + * FIELDS), and the disconnect is lost. This is particularly bad when + * the new disconnect was for UNCONFIGURING, since the unconfigure hangs + * forever. 
+ * Fix is that free command queue sets acr state and acr flags if there + * is a change under the lock + * note free command queue writes to this state it clears it + * before releasing the lock, different drivers call the free command + * queue different times so dont initialize above + */ + if (vscsi->phyp_acr_state != 0) { + /* + * set any bits in flags that may have been cleared by + * a call to free command queue in switch statement + * or reset queue + */ + vscsi->flags |= vscsi->phyp_acr_flags; + ibmvscsis_post_disconnect(vscsi, vscsi->phyp_acr_state, 0); + vscsi->phyp_acr_state = 0; + vscsi->phyp_acr_flags = 0; + + pr_debug("adapter_idle: flags 0x%x, state 0x%hx, acr_flags 0x%x, acr_state 0x%hx\n", + vscsi->flags, vscsi->state, vscsi->phyp_acr_flags, + vscsi->phyp_acr_state); + } + + pr_debug("Leaving adapter_idle: flags 0x%x, state 0x%hx, new_state 0x%x\n", + vscsi->flags, vscsi->state, vscsi->new_state); +} + +/** + * ibmvscsis_copy_crq_packet() - Copy CRQ Packet + * @vscsi: Pointer to our adapter structure + * @cmd: Pointer to command element to use to process the request + * @crq: Pointer to CRQ entry containing the request + * + * Copy the srp information unit from the hosted + * partition using remote dma + * + * EXECUTION ENVIRONMENT: + * Interrupt, interrupt lock held + */ +static long ibmvscsis_copy_crq_packet(struct scsi_info *vscsi, + struct ibmvscsis_cmd *cmd, + struct viosrp_crq *crq) +{ + struct iu_entry *iue = cmd->iue; + long rc = 0; + u16 len; + + len = be16_to_cpu(crq->IU_length); + if ((len > SRP_MAX_IU_LEN) || (len == 0)) { + dev_err(&vscsi->dev, "copy_crq: Invalid len %d passed", len); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + return SRP_VIOLATION; + } + + rc = h_copy_rdma(len, vscsi->dds.window[REMOTE].liobn, + be64_to_cpu(crq->IU_data_ptr), + vscsi->dds.window[LOCAL].liobn, iue->sbuf->dma); + + switch (rc) { + case H_SUCCESS: + cmd->init_time = mftb(); + iue->remote_token = crq->IU_data_ptr; + iue->iu_len = 
len; + pr_debug("copy_crq: ioba 0x%llx, init_time 0x%llx\n", + be64_to_cpu(crq->IU_data_ptr), cmd->init_time); + break; + case H_PERMISSION: + if (connection_broken(vscsi)) + ibmvscsis_post_disconnect(vscsi, + ERR_DISCONNECT_RECONNECT, + (RESPONSE_Q_DOWN | + CLIENT_FAILED)); + else + ibmvscsis_post_disconnect(vscsi, + ERR_DISCONNECT_RECONNECT, 0); + + dev_err(&vscsi->dev, "copy_crq: h_copy_rdma failed, rc %ld\n", + rc); + break; + case H_DEST_PARM: + case H_SOURCE_PARM: + default: + dev_err(&vscsi->dev, "copy_crq: h_copy_rdma failed, rc %ld\n", + rc); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + break; + } + + return rc; +} + +/** + * ibmvscsis_adapter_info - Service an Adapter Info MAnagement Data gram + * @vscsi: Pointer to our adapter structure + * @iue: Information Unit containing the Adapter Info MAD request + * + * EXECUTION ENVIRONMENT: + * Interrupt adpater lock is held + */ +static long ibmvscsis_adapter_info(struct scsi_info *vscsi, + struct iu_entry *iue) +{ + struct viosrp_adapter_info *mad = &vio_iu(iue)->mad.adapter_info; + struct mad_adapter_info_data *info; + uint flag_bits = 0; + dma_addr_t token; + long rc; + + mad->common.status = cpu_to_be16(VIOSRP_MAD_SUCCESS); + + if (be16_to_cpu(mad->common.length) > sizeof(*info)) { + mad->common.status = cpu_to_be16(VIOSRP_MAD_FAILED); + return 0; + } + + info = dma_alloc_coherent(&vscsi->dma_dev->dev, sizeof(*info), &token, + GFP_KERNEL); + if (!info) { + dev_err(&vscsi->dev, "bad dma_alloc_coherent %p\n", + iue->target); + mad->common.status = cpu_to_be16(VIOSRP_MAD_FAILED); + return 0; + } + + /* Get remote info */ + rc = h_copy_rdma(be16_to_cpu(mad->common.length), + vscsi->dds.window[REMOTE].liobn, + be64_to_cpu(mad->buffer), + vscsi->dds.window[LOCAL].liobn, token); + + if (rc != H_SUCCESS) { + if (rc == H_PERMISSION) { + if (connection_broken(vscsi)) + flag_bits = (RESPONSE_Q_DOWN | CLIENT_FAILED); + } + pr_warn("adapter_info: h_copy_rdma from client failed, rc %ld\n", + rc); + 
pr_debug("adapter_info: ioba 0x%llx, flags 0x%x, flag_bits 0x%x\n", + be64_to_cpu(mad->buffer), vscsi->flags, flag_bits); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, + flag_bits); + goto free_dma; + } + + /* + * Copy client info, but ignore partition number, which we + * already got from phyp - unless we failed to get it from + * phyp (e.g. if we're running on a p5 system). + */ + if (vscsi->client_data.partition_number == 0) + vscsi->client_data.partition_number = + be32_to_cpu(info->partition_number); + strncpy(vscsi->client_data.srp_version, info->srp_version, + sizeof(vscsi->client_data.srp_version)); + strncpy(vscsi->client_data.partition_name, info->partition_name, + sizeof(vscsi->client_data.partition_name)); + vscsi->client_data.mad_version = be32_to_cpu(info->mad_version); + vscsi->client_data.os_type = be32_to_cpu(info->os_type); + + /* Copy our info */ + strncpy(info->srp_version, SRP_VERSION, + sizeof(info->srp_version)); + strncpy(info->partition_name, vscsi->dds.partition_name, + sizeof(info->partition_name)); + info->partition_number = cpu_to_be32(vscsi->dds.partition_num); + info->mad_version = cpu_to_be32(MAD_VERSION_1); + info->os_type = cpu_to_be32(LINUX); + memset(&info->port_max_txu[0], 0, sizeof(info->port_max_txu)); + info->port_max_txu[0] = cpu_to_be32(128 * PAGE_SIZE); + + dma_wmb(); + rc = h_copy_rdma(sizeof(*info), vscsi->dds.window[LOCAL].liobn, + token, vscsi->dds.window[REMOTE].liobn, + be64_to_cpu(mad->buffer)); + switch (rc) { + case H_SUCCESS: + break; + + case H_SOURCE_PARM: + case H_DEST_PARM: + case H_PERMISSION: + if (connection_broken(vscsi)) + flag_bits = (RESPONSE_Q_DOWN | CLIENT_FAILED); + default: + dev_err(&vscsi->dev, "adapter_info: h_copy_rdma to client failed, rc %ld\n", + rc); + ibmvscsis_post_disconnect(vscsi, + ERR_DISCONNECT_RECONNECT, + flag_bits); + break; + } + +free_dma: + dma_free_coherent(&vscsi->dma_dev->dev, sizeof(*info), info, token); + pr_debug("Leaving adapter_info, rc %ld\n", rc); + + 
return rc; +} + +/** + * ibmvscsis_cap_mad() - Service a Capabilities MAnagement Data gram + * @vscsi: Pointer to our adapter structure + * @iue: Information Unit containing the Capabilities MAD request + * + * NOTE: if you return an error from this routine you must be + * disconnecting or you will cause a hang + * + * EXECUTION ENVIRONMENT: + * Interrupt called with adapter lock held + */ +static int ibmvscsis_cap_mad(struct scsi_info *vscsi, struct iu_entry *iue) +{ + struct viosrp_capabilities *mad = &vio_iu(iue)->mad.capabilities; + struct capabilities *cap; + struct mad_capability_common *common; + dma_addr_t token; + u16 olen, len, status, min_len, cap_len; + u32 flag; + uint flag_bits = 0; + long rc = 0; + + olen = be16_to_cpu(mad->common.length); + /* + * struct capabilities hardcodes a couple capabilities after the + * header, but the capabilities can actually be in any order. + */ + min_len = offsetof(struct capabilities, migration); + if ((olen < min_len) || (olen > PAGE_SIZE)) { + pr_warn("cap_mad: invalid len %d\n", olen); + mad->common.status = cpu_to_be16(VIOSRP_MAD_FAILED); + return 0; + } + + cap = dma_alloc_coherent(&vscsi->dma_dev->dev, olen, &token, + GFP_KERNEL); + if (!cap) { + dev_err(&vscsi->dev, "bad dma_alloc_coherent %p\n", + iue->target); + mad->common.status = cpu_to_be16(VIOSRP_MAD_FAILED); + return 0; + } + rc = h_copy_rdma(olen, vscsi->dds.window[REMOTE].liobn, + be64_to_cpu(mad->buffer), + vscsi->dds.window[LOCAL].liobn, token); + if (rc == H_SUCCESS) { + strncpy(cap->name, dev_name(&vscsi->dma_dev->dev), + SRP_MAX_LOC_LEN); + + len = olen - min_len; + status = VIOSRP_MAD_SUCCESS; + common = (struct mad_capability_common *)&cap->migration; + + while ((len > 0) && (status == VIOSRP_MAD_SUCCESS) && !rc) { + pr_debug("cap_mad: len left %hd, cap type %d, cap len %hd\n", + len, be32_to_cpu(common->cap_type), + be16_to_cpu(common->length)); + + cap_len = be16_to_cpu(common->length); + if (cap_len > len) { + dev_err(&vscsi->dev, "cap_mad: 
cap len mismatch with total len\n"); + status = VIOSRP_MAD_FAILED; + break; + } + + if (cap_len == 0) { + dev_err(&vscsi->dev, "cap_mad: cap len is 0\n"); + status = VIOSRP_MAD_FAILED; + break; + } + + switch (common->cap_type) { + default: + pr_debug("cap_mad: unsupported capability\n"); + common->server_support = 0; + flag = cpu_to_be32((u32)CAP_LIST_SUPPORTED); + cap->flags &= ~flag; + break; + } + + len = len - cap_len; + common = (struct mad_capability_common *) + ((char *)common + cap_len); + } + + mad->common.status = cpu_to_be16(status); + + dma_wmb(); + rc = h_copy_rdma(olen, vscsi->dds.window[LOCAL].liobn, token, + vscsi->dds.window[REMOTE].liobn, + be64_to_cpu(mad->buffer)); + + if (rc != H_SUCCESS) { + pr_debug("cap_mad: failed to copy to client, rc %ld\n", + rc); + + if (rc == H_PERMISSION) { + if (connection_broken(vscsi)) + flag_bits = (RESPONSE_Q_DOWN | + CLIENT_FAILED); + } + + pr_warn("cap_mad: error copying data to client, rc %ld\n", + rc); + ibmvscsis_post_disconnect(vscsi, + ERR_DISCONNECT_RECONNECT, + flag_bits); + } + } + + dma_free_coherent(&vscsi->dma_dev->dev, olen, cap, token); + + pr_debug("Leaving cap_mad, rc %ld, client_cap 0x%x\n", + rc, vscsi->client_cap); + + return rc; +} + +/** + * ibmvscsis_process_mad() - Service a MAnagement Data gram + * @vscsi: Pointer to our adapter structure + * @iue: Information Unit containing the MAD request + * + * Must be called with interrupt lock held. 
+ */ +static long ibmvscsis_process_mad(struct scsi_info *vscsi, struct iu_entry *iue) +{ + struct mad_common *mad = (struct mad_common *)&vio_iu(iue)->mad; + struct viosrp_empty_iu *empty; + long rc = ADAPT_SUCCESS; + + switch (be32_to_cpu(mad->type)) { + case VIOSRP_EMPTY_IU_TYPE: + empty = &vio_iu(iue)->mad.empty_iu; + vscsi->empty_iu_id = be64_to_cpu(empty->buffer); + vscsi->empty_iu_tag = be64_to_cpu(empty->common.tag); + mad->status = cpu_to_be16(VIOSRP_MAD_SUCCESS); + break; + case VIOSRP_ADAPTER_INFO_TYPE: + rc = ibmvscsis_adapter_info(vscsi, iue); + break; + case VIOSRP_CAPABILITIES_TYPE: + rc = ibmvscsis_cap_mad(vscsi, iue); + break; + case VIOSRP_ENABLE_FAST_FAIL: + if (vscsi->state == CONNECTED) { + vscsi->fast_fail = true; + mad->status = cpu_to_be16(VIOSRP_MAD_SUCCESS); + } else { + pr_warn("fast fail mad sent after login\n"); + mad->status = cpu_to_be16(VIOSRP_MAD_FAILED); + } + break; + default: + mad->status = cpu_to_be16(VIOSRP_MAD_NOT_SUPPORTED); + break; + } + + return rc; +} + +/** + * srp_snd_msg_failed() - Handle an error when sending a response + * @vscsi: Pointer to our adapter structure + * @rc: The return code from the h_send_crq command + * + * Must be called with interrupt lock held. + */ +static void srp_snd_msg_failed(struct scsi_info *vscsi, long rc) +{ + ktime_t kt; + + if (rc != H_DROPPED) { + ibmvscsis_free_cmd_qs(vscsi); + + if (rc == H_CLOSED) + vscsi->flags |= CLIENT_FAILED; + + /* don't flag the same problem multiple times */ + if (!(vscsi->flags & RESPONSE_Q_DOWN)) { + vscsi->flags |= RESPONSE_Q_DOWN; + if (!(vscsi->state & (ERR_DISCONNECT | + ERR_DISCONNECT_RECONNECT | + ERR_DISCONNECTED | UNDEFINED))) { + dev_err(&vscsi->dev, "snd_msg_failed: setting RESPONSE_Q_DOWN, state 0x%hx, flags 0x%x, rc %ld\n", + vscsi->state, vscsi->flags, rc); + } + ibmvscsis_post_disconnect(vscsi, + ERR_DISCONNECT_RECONNECT, 0); + } + return; + } + + /* + * The response queue is full. + * If the server is processing SRP requests, i.e. 
+ * the client has successfully done an + * SRP_LOGIN, then it will wait forever for room in + * the queue. However if the system admin + * is attempting to unconfigure the server then one + * or more children will be in a state where + * they are being removed. So if there is even one + * child being removed then the driver assumes + * the system admin is attempting to break the + * connection with the client and MAX_TIMER_POPS + * is honored. + */ + if ((vscsi->rsp_q_timer.timer_pops < MAX_TIMER_POPS) || + (vscsi->state == SRP_PROCESSING)) { + pr_debug("snd_msg_failed: response queue full, flags 0x%x, timer started %d, pops %d\n", + vscsi->flags, (int)vscsi->rsp_q_timer.started, + vscsi->rsp_q_timer.timer_pops); + + /* + * Check if the timer is running; if it + * is not then start it up. + */ + if (!vscsi->rsp_q_timer.started) { + if (vscsi->rsp_q_timer.timer_pops < + MAX_TIMER_POPS) { + kt = ktime_set(0, WAIT_NANO_SECONDS); + } else { + /* + * slide the timeslice if the maximum + * timer pops have already happened + */ + kt = ktime_set(WAIT_SECONDS, 0); + } + + vscsi->rsp_q_timer.started = true; + hrtimer_start(&vscsi->rsp_q_timer.timer, kt, + HRTIMER_MODE_REL); + } + } else { + /* + * TBD: Do we need to worry about this? Need to get + * remove working. 
+ */ + /* + * waited a long time and it appears the system admin + * is bring this driver down + */ + vscsi->flags |= RESPONSE_Q_DOWN; + ibmvscsis_free_cmd_qs(vscsi); + /* + * if the driver is already attempting to disconnect + * from the client and has already logged an error + * trace this event but don't put it in the error log + */ + if (!(vscsi->state & (ERR_DISCONNECT | + ERR_DISCONNECT_RECONNECT | + ERR_DISCONNECTED | UNDEFINED))) { + dev_err(&vscsi->dev, "client crq full too long\n"); + ibmvscsis_post_disconnect(vscsi, + ERR_DISCONNECT_RECONNECT, + 0); + } + } +} + +/** + * ibmvscsis_send_messages() - Send a Response + * @vscsi: Pointer to our adapter structure + * + * Send a response, first checking the waiting queue. Responses are + * sent in order they are received. If the response cannot be sent, + * because the client queue is full, it stays on the waiting queue. + * + * PRECONDITION: + * Called with interrupt lock held + */ +static void ibmvscsis_send_messages(struct scsi_info *vscsi) +{ + u64 msg_hi = 0; + /* note do not attmempt to access the IU_data_ptr with this pointer + * it is not valid + */ + struct viosrp_crq *crq = (struct viosrp_crq *)&msg_hi; + struct ibmvscsis_cmd *cmd, *nxt; + struct iu_entry *iue; + long rc = ADAPT_SUCCESS; + + if (!(vscsi->flags & RESPONSE_Q_DOWN)) { + list_for_each_entry_safe(cmd, nxt, &vscsi->waiting_rsp, list) { + pr_debug("send_messages cmd %p\n", cmd); + + iue = cmd->iue; + + crq->valid = VALID_CMD_RESP_EL; + crq->format = cmd->rsp.format; + + if (cmd->flags & CMD_FAST_FAIL) + crq->status = VIOSRP_ADAPTER_FAIL; + + crq->IU_length = cpu_to_be16(cmd->rsp.len); + + rc = h_send_crq(vscsi->dma_dev->unit_address, + be64_to_cpu(msg_hi), + be64_to_cpu(cmd->rsp.tag)); + + pr_debug("send_messages: tag 0x%llx, rc %ld\n", + be64_to_cpu(cmd->rsp.tag), rc); + + /* if all ok free up the command element resources */ + if (rc == H_SUCCESS) { + /* some movement has occurred */ + vscsi->rsp_q_timer.timer_pops = 0; + 
list_del(&cmd->list); + + ibmvscsis_free_cmd_resources(vscsi, cmd); + } else { + srp_snd_msg_failed(vscsi, rc); + break; + } + } + + if (!rc) { + /* + * The timer could pop with the queue empty. If + * this happens, rc will always indicate a + * success; clear the pop count. + */ + vscsi->rsp_q_timer.timer_pops = 0; + } + } else { + ibmvscsis_free_cmd_qs(vscsi); + } +} + +/* Called with intr lock held */ +static void ibmvscsis_send_mad_resp(struct scsi_info *vscsi, + struct ibmvscsis_cmd *cmd, + struct viosrp_crq *crq) +{ + struct iu_entry *iue = cmd->iue; + struct mad_common *mad = (struct mad_common *)&vio_iu(iue)->mad; + uint flag_bits = 0; + long rc; + + dma_wmb(); + rc = h_copy_rdma(sizeof(struct mad_common), + vscsi->dds.window[LOCAL].liobn, iue->sbuf->dma, + vscsi->dds.window[REMOTE].liobn, + be64_to_cpu(crq->IU_data_ptr)); + if (!rc) { + cmd->rsp.format = VIOSRP_MAD_FORMAT; + cmd->rsp.len = sizeof(struct mad_common); + cmd->rsp.tag = mad->tag; + list_add_tail(&cmd->list, &vscsi->waiting_rsp); + ibmvscsis_send_messages(vscsi); + } else { + pr_debug("Error sending mad response, rc %ld\n", rc); + if (rc == H_PERMISSION) { + if (connection_broken(vscsi)) + flag_bits = (RESPONSE_Q_DOWN | CLIENT_FAILED); + } + dev_err(&vscsi->dev, "mad: failed to copy to client, rc %ld\n", + rc); + + ibmvscsis_free_cmd_resources(vscsi, cmd); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, + flag_bits); + } +} + +/** + * ibmvscsis_mad() - Service a MAnagement Data gram. 
+ * @vscsi: Pointer to our adapter structure + * @crq: Pointer to the CRQ entry containing the MAD request + * + * EXECUTION ENVIRONMENT: + * Interrupt called with adapter lock held + */ +static long ibmvscsis_mad(struct scsi_info *vscsi, struct viosrp_crq *crq) +{ + struct iu_entry *iue; + struct ibmvscsis_cmd *cmd; + struct mad_common *mad; + long rc = ADAPT_SUCCESS; + + switch (vscsi->state) { + /* + * We have not exchanged Init Msgs yet, so this MAD was sent + * before the last Transport Event; client will not be + * expecting a response. + */ + case WAIT_CONNECTION: + pr_debug("mad: in Wait Connection state, ignoring MAD, flags %d\n", + vscsi->flags); + return ADAPT_SUCCESS; + + case SRP_PROCESSING: + case CONNECTED: + break; + + /* + * We should never get here while we're in these states. + * Just log an error and get out. + */ + case UNCONFIGURING: + case WAIT_IDLE: + case ERR_DISCONNECT: + case ERR_DISCONNECT_RECONNECT: + default: + dev_err(&vscsi->dev, "mad: invalid adapter state %d for mad\n", + vscsi->state); + return ADAPT_SUCCESS; + } + + cmd = ibmvscsis_get_free_cmd(vscsi); + if (!cmd) { + dev_err(&vscsi->dev, "mad: failed to get cmd, debit %d\n", + vscsi->debit); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + return ERROR; + } + iue = cmd->iue; + cmd->type = ADAPTER_MAD; + + rc = ibmvscsis_copy_crq_packet(vscsi, cmd, crq); + if (!rc) { + mad = (struct mad_common *)&vio_iu(iue)->mad; + + pr_debug("mad: type %d\n", be32_to_cpu(mad->type)); + + if (be16_to_cpu(mad->length) < 0) { + dev_err(&vscsi->dev, "mad: length is < 0\n"); + ibmvscsis_post_disconnect(vscsi, + ERR_DISCONNECT_RECONNECT, 0); + rc = SRP_VIOLATION; + } else { + rc = ibmvscsis_process_mad(vscsi, iue); + } + + pr_debug("mad: status %hd, rc %ld\n", be16_to_cpu(mad->status), + rc); + + if (!rc) + ibmvscsis_send_mad_resp(vscsi, cmd, crq); + } else { + ibmvscsis_free_cmd_resources(vscsi, cmd); + } + + pr_debug("Leaving mad, rc %ld\n", rc); + return rc; +} + +/** + * 
ibmvscsis_login_rsp() - Create/copy a login response notice to the client + * @vscsi: Pointer to our adapter structure + * @cmd: Pointer to the command for the SRP Login request + * + * EXECUTION ENVIRONMENT: + * Interrupt, interrupt lock held + */ +static long ibmvscsis_login_rsp(struct scsi_info *vscsi, + struct ibmvscsis_cmd *cmd) +{ + struct iu_entry *iue = cmd->iue; + struct srp_login_rsp *rsp = &vio_iu(iue)->srp.login_rsp; + struct format_code *fmt; + uint flag_bits = 0; + long rc = ADAPT_SUCCESS; + + memset(rsp, 0, sizeof(struct srp_login_rsp)); + + rsp->opcode = SRP_LOGIN_RSP; + rsp->req_lim_delta = cpu_to_be32(vscsi->request_limit); + rsp->tag = cmd->rsp.tag; + rsp->max_it_iu_len = cpu_to_be32(SRP_MAX_IU_LEN); + rsp->max_ti_iu_len = cpu_to_be32(SRP_MAX_IU_LEN); + fmt = (struct format_code *)&rsp->buf_fmt; + fmt->buffers = SUPPORTED_FORMATS; + vscsi->credit = 0; + + cmd->rsp.len = sizeof(struct srp_login_rsp); + + dma_wmb(); + rc = h_copy_rdma(cmd->rsp.len, vscsi->dds.window[LOCAL].liobn, + iue->sbuf->dma, vscsi->dds.window[REMOTE].liobn, + be64_to_cpu(iue->remote_token)); + + switch (rc) { + case H_SUCCESS: + break; + + case H_PERMISSION: + if (connection_broken(vscsi)) + flag_bits = RESPONSE_Q_DOWN | CLIENT_FAILED; + dev_err(&vscsi->dev, "login_rsp: error copying to client, rc %ld\n", + rc); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, + flag_bits); + break; + case H_SOURCE_PARM: + case H_DEST_PARM: + default: + dev_err(&vscsi->dev, "login_rsp: error copying to client, rc %ld\n", + rc); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + break; + } + + return rc; +} + +/** + * ibmvscsis_srp_login_rej() - Create/copy a login rejection notice to client + * @vscsi: Pointer to our adapter structure + * @cmd: Pointer to the command for the SRP Login request + * @reason: The reason the SRP Login is being rejected, per SRP protocol + * + * EXECUTION ENVIRONMENT: + * Interrupt, interrupt lock held + */ +static long 
ibmvscsis_srp_login_rej(struct scsi_info *vscsi, + struct ibmvscsis_cmd *cmd, u32 reason) +{ + struct iu_entry *iue = cmd->iue; + struct srp_login_rej *rej = &vio_iu(iue)->srp.login_rej; + struct format_code *fmt; + uint flag_bits = 0; + long rc = ADAPT_SUCCESS; + + memset(rej, 0, sizeof(*rej)); + + rej->opcode = SRP_LOGIN_REJ; + rej->reason = cpu_to_be32(reason); + rej->tag = cmd->rsp.tag; + fmt = (struct format_code *)&rej->buf_fmt; + fmt->buffers = SUPPORTED_FORMATS; + + cmd->rsp.len = sizeof(*rej); + + dma_wmb(); + rc = h_copy_rdma(cmd->rsp.len, vscsi->dds.window[LOCAL].liobn, + iue->sbuf->dma, vscsi->dds.window[REMOTE].liobn, + be64_to_cpu(iue->remote_token)); + + switch (rc) { + case H_SUCCESS: + break; + case H_PERMISSION: + if (connection_broken(vscsi)) + flag_bits = RESPONSE_Q_DOWN | CLIENT_FAILED; + dev_err(&vscsi->dev, "login_rej: error copying to client, rc %ld\n", + rc); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, + flag_bits); + break; + case H_SOURCE_PARM: + case H_DEST_PARM: + default: + dev_err(&vscsi->dev, "login_rej: error copying to client, rc %ld\n", + rc); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + break; + } + + return rc; +} + +static int ibmvscsis_make_nexus(struct ibmvscsis_tport *tport) +{ + char *name = tport->tport_name; + struct ibmvscsis_nexus *nexus; + int rc; + + if (tport->ibmv_nexus) { + pr_debug("tport->ibmv_nexus already exists\n"); + return 0; + } + + nexus = kzalloc(sizeof(*nexus), GFP_KERNEL); + if (!nexus) { + pr_err("Unable to allocate struct ibmvscsis_nexus\n"); + return -ENOMEM; + } + + nexus->se_sess = target_alloc_session(&tport->se_tpg, 0, 0, + TARGET_PROT_NORMAL, name, nexus, + NULL); + if (IS_ERR(nexus->se_sess)) { + rc = PTR_ERR(nexus->se_sess); + goto transport_init_fail; + } + + tport->ibmv_nexus = nexus; + + return 0; + +transport_init_fail: + kfree(nexus); + return rc; +} + +static int ibmvscsis_drop_nexus(struct ibmvscsis_tport *tport) +{ + struct se_session *se_sess; + 
struct ibmvscsis_nexus *nexus; + + nexus = tport->ibmv_nexus; + if (!nexus) + return -ENODEV; + + se_sess = nexus->se_sess; + if (!se_sess) + return -ENODEV; + + /* + * Release the SCSI I_T Nexus to the emulated ibmvscsis Target Port + */ + transport_deregister_session(se_sess); + tport->ibmv_nexus = NULL; + kfree(nexus); + + return 0; +} + +/** + * ibmvscsis_srp_login() - Process an SRP Login Request + * @vscsi: Pointer to our adapter structure + * @cmd: Command element to use to process the SRP Login request + * @crq: Pointer to CRQ entry containing the SRP Login request + * + * EXECUTION ENVIRONMENT: + * Interrupt, called with interrupt lock held + */ +static long ibmvscsis_srp_login(struct scsi_info *vscsi, + struct ibmvscsis_cmd *cmd, + struct viosrp_crq *crq) +{ + struct iu_entry *iue = cmd->iue; + struct srp_login_req *req = &vio_iu(iue)->srp.login_req; + struct port_id { + __be64 id_extension; + __be64 io_guid; + } *iport, *tport; + struct format_code *fmt; + u32 reason = 0x0; + long rc = ADAPT_SUCCESS; + + iport = (struct port_id *)req->initiator_port_id; + tport = (struct port_id *)req->target_port_id; + fmt = (struct format_code *)&req->req_buf_fmt; + if (be32_to_cpu(req->req_it_iu_len) > SRP_MAX_IU_LEN) + reason = SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE; + else if (be32_to_cpu(req->req_it_iu_len) < 64) + reason = SRP_LOGIN_REJ_UNABLE_ESTABLISH_CHANNEL; + else if ((be64_to_cpu(iport->id_extension) > (MAX_NUM_PORTS - 1)) || + (be64_to_cpu(tport->id_extension) > (MAX_NUM_PORTS - 1))) + reason = SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL; + else if (req->req_flags & SRP_MULTICHAN_MULTI) + reason = SRP_LOGIN_REJ_MULTI_CHANNEL_UNSUPPORTED; + else if (fmt->buffers & (~SUPPORTED_FORMATS)) + reason = SRP_LOGIN_REJ_UNSUPPORTED_DESCRIPTOR_FMT; + else if ((fmt->buffers | SUPPORTED_FORMATS) == 0) + reason = SRP_LOGIN_REJ_UNSUPPORTED_DESCRIPTOR_FMT; + + if (vscsi->state == SRP_PROCESSING) + reason = SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED; + + rc = 
ibmvscsis_make_nexus(&vscsi->tport); + if (rc) + reason = SRP_LOGIN_REJ_UNABLE_ESTABLISH_CHANNEL; + + cmd->rsp.format = VIOSRP_SRP_FORMAT; + cmd->rsp.tag = req->tag; + + pr_debug("srp_login: reason 0x%x\n", reason); + + if (reason) + rc = ibmvscsis_srp_login_rej(vscsi, cmd, reason); + else + rc = ibmvscsis_login_rsp(vscsi, cmd); + + if (!rc) { + if (!reason) + vscsi->state = SRP_PROCESSING; + + list_add_tail(&cmd->list, &vscsi->waiting_rsp); + ibmvscsis_send_messages(vscsi); + } else { + ibmvscsis_free_cmd_resources(vscsi, cmd); + } + + pr_debug("Leaving srp_login, rc %ld\n", rc); + return rc; +} + +/** + * ibmvscsis_srp_i_logout() - Helper Function to close I_T Nexus + * @vscsi: Pointer to our adapter structure + * @cmd: Command element to use to process the Implicit Logout request + * @crq: Pointer to CRQ entry containing the Implicit Logout request + * + * Do the logic to close the I_T nexus. This function may not + * behave to specification. + * + * EXECUTION ENVIRONMENT: + * Interrupt, interrupt lock held + */ +static long ibmvscsis_srp_i_logout(struct scsi_info *vscsi, + struct ibmvscsis_cmd *cmd, + struct viosrp_crq *crq) +{ + struct iu_entry *iue = cmd->iue; + struct srp_i_logout *log_out = &vio_iu(iue)->srp.i_logout; + long rc = ADAPT_SUCCESS; + + if ((vscsi->debit > 0) || !list_empty(&vscsi->schedule_q) || + !list_empty(&vscsi->waiting_rsp)) { + dev_err(&vscsi->dev, "i_logout: outstanding work\n"); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT, 0); + } else { + cmd->rsp.format = SRP_FORMAT; + cmd->rsp.tag = log_out->tag; + cmd->rsp.len = sizeof(struct mad_common); + list_add_tail(&cmd->list, &vscsi->waiting_rsp); + ibmvscsis_send_messages(vscsi); + + ibmvscsis_post_disconnect(vscsi, WAIT_IDLE, 0); + } + + return rc; +} + +/* Called with intr lock held */ +static void ibmvscsis_srp_cmd(struct scsi_info *vscsi, struct viosrp_crq *crq) +{ + struct ibmvscsis_cmd *cmd; + struct iu_entry *iue; + struct srp_cmd *srp; + struct srp_tsk_mgmt *tsk; + long rc; + 
+ if (vscsi->request_limit - vscsi->debit <= 0) { + /* Client has exceeded request limit */ + dev_err(&vscsi->dev, "Client exceeded the request limit (%d), debit %d\n", + vscsi->request_limit, vscsi->debit); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + return; + } + + cmd = ibmvscsis_get_free_cmd(vscsi); + if (!cmd) { + dev_err(&vscsi->dev, "srp_cmd failed to get cmd, debit %d\n", + vscsi->debit); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + return; + } + iue = cmd->iue; + srp = &vio_iu(iue)->srp.cmd; + + rc = ibmvscsis_copy_crq_packet(vscsi, cmd, crq); + if (rc) { + ibmvscsis_free_cmd_resources(vscsi, cmd); + return; + } + + if (vscsi->state == SRP_PROCESSING) { + switch (srp->opcode) { + case SRP_LOGIN_REQ: + rc = ibmvscsis_srp_login(vscsi, cmd, crq); + break; + + case SRP_TSK_MGMT: + tsk = &vio_iu(iue)->srp.tsk_mgmt; + pr_debug("tsk_mgmt tag: %llu (0x%llx)\n", tsk->tag, + tsk->tag); + cmd->rsp.tag = tsk->tag; + vscsi->debit += 1; + cmd->type = TASK_MANAGEMENT; + list_add_tail(&cmd->list, &vscsi->schedule_q); + queue_work(vscsi->work_q, &cmd->work); + break; + + case SRP_CMD: + pr_debug("srp_cmd tag: %llu (0x%llx)\n", srp->tag, + srp->tag); + cmd->rsp.tag = srp->tag; + vscsi->debit += 1; + cmd->type = SCSI_CDB; + /* + * We want to keep track of work waiting for + * the workqueue. 
+ */ + list_add_tail(&cmd->list, &vscsi->schedule_q); + queue_work(vscsi->work_q, &cmd->work); + break; + + case SRP_I_LOGOUT: + rc = ibmvscsis_srp_i_logout(vscsi, cmd, crq); + break; + + case SRP_CRED_RSP: + case SRP_AER_RSP: + default: + ibmvscsis_free_cmd_resources(vscsi, cmd); + dev_err(&vscsi->dev, "invalid srp cmd, opcode %d\n", + (uint)srp->opcode); + ibmvscsis_post_disconnect(vscsi, + ERR_DISCONNECT_RECONNECT, 0); + break; + } + } else if (srp->opcode == SRP_LOGIN_REQ && vscsi->state == CONNECTED) { + rc = ibmvscsis_srp_login(vscsi, cmd, crq); + } else { + ibmvscsis_free_cmd_resources(vscsi, cmd); + dev_err(&vscsi->dev, "Invalid state %d to handle srp cmd\n", + vscsi->state); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + } +} + +/** + * ibmvscsis_ping_response() - Respond to a ping request + * @vscsi: Pointer to our adapter structure + * + * Let the client know that the server is alive and waiting on + * its native I/O stack. + * If any type of error occurs from the call to queue a ping + * response then the client is either not accepting or receiving + * interrupts. Disconnect with an error. 
+ * + * EXECUTION ENVIRONMENT: + * Interrupt, interrupt lock held + */ +static long ibmvscsis_ping_response(struct scsi_info *vscsi) +{ + struct viosrp_crq *crq; + u64 buffer[2] = { 0, 0 }; + long rc; + + crq = (struct viosrp_crq *)&buffer; + crq->valid = VALID_CMD_RESP_EL; + crq->format = (u8)MESSAGE_IN_CRQ; + crq->status = PING_RESPONSE; + + rc = h_send_crq(vscsi->dds.unit_id, cpu_to_be64(buffer[MSG_HI]), + cpu_to_be64(buffer[MSG_LOW])); + + switch (rc) { + case H_SUCCESS: + break; + case H_CLOSED: + vscsi->flags |= CLIENT_FAILED; + case H_DROPPED: + vscsi->flags |= RESPONSE_Q_DOWN; + case H_REMOTE_PARM: + dev_err(&vscsi->dev, "ping_response: h_send_crq failed, rc %ld\n", + rc); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + break; + default: + dev_err(&vscsi->dev, "ping_response: h_send_crq returned unknown rc %ld\n", + rc); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT, 0); + break; + } + + return rc; +} + +/** + * ibmvscsis_handle_init_compl_msg() - Respond to an Init Complete Message + * @vscsi: Pointer to our adapter structure + * + * Must be called with interrupt lock held. + */ +static long ibmvscsis_handle_init_compl_msg(struct scsi_info *vscsi) +{ + long rc = ADAPT_SUCCESS; + + switch (vscsi->state) { + case NO_QUEUE: + case ERR_DISCONNECT: + case ERR_DISCONNECT_RECONNECT: + case ERR_DISCONNECTED: + case UNCONFIGURING: + case UNDEFINED: + rc = ERROR; + break; + + case WAIT_CONNECTION: + vscsi->state = CONNECTED; + break; + + case WAIT_IDLE: + case SRP_PROCESSING: + case CONNECTED: + case WAIT_ENABLED: + case PART_UP_WAIT_ENAB: + default: + rc = ERROR; + dev_err(&vscsi->dev, "init_msg: invalid state %d to get init compl msg\n", + vscsi->state); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + break; + } + + return rc; +} + +/** + * ibmvscsis_handle_init_msg() - Respond to an Init Message + * @vscsi: Pointer to our adapter structure + * + * Must be called with interrupt lock held. 
+ */ +static long ibmvscsis_handle_init_msg(struct scsi_info *vscsi) +{ + long rc = ADAPT_SUCCESS; + + switch (vscsi->state) { + case WAIT_ENABLED: + vscsi->state = PART_UP_WAIT_ENAB; + break; + + case WAIT_CONNECTION: + rc = ibmvscsis_send_init_message(vscsi, INIT_COMPLETE_MSG); + switch (rc) { + case H_SUCCESS: + vscsi->state = CONNECTED; + break; + + case H_PARAMETER: + dev_err(&vscsi->dev, "init_msg: failed to send, rc %ld\n", + rc); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT, 0); + break; + + case H_DROPPED: + dev_err(&vscsi->dev, "init_msg: failed to send, rc %ld\n", + rc); + rc = ERROR; + ibmvscsis_post_disconnect(vscsi, + ERR_DISCONNECT_RECONNECT, 0); + break; + + case H_CLOSED: + pr_warn("init_msg: failed to send, rc %ld\n", rc); + rc = 0; + break; + } + break; + + case UNDEFINED: + rc = ERROR; + break; + + case UNCONFIGURING: + break; + + case PART_UP_WAIT_ENAB: + case CONNECTED: + case SRP_PROCESSING: + case WAIT_IDLE: + case NO_QUEUE: + case ERR_DISCONNECT: + case ERR_DISCONNECT_RECONNECT: + case ERR_DISCONNECTED: + default: + rc = ERROR; + dev_err(&vscsi->dev, "init_msg: invalid state %d to get init msg\n", + vscsi->state); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + break; + } + + return rc; +} + +/** + * ibmvscsis_init_msg() - Respond to an init message + * @vscsi: Pointer to our adapter structure + * @crq: Pointer to CRQ element containing the Init Message + * + * EXECUTION ENVIRONMENT: + * Interrupt, interrupt lock held + */ +static long ibmvscsis_init_msg(struct scsi_info *vscsi, struct viosrp_crq *crq) +{ + long rc = ADAPT_SUCCESS; + + pr_debug("init_msg: state 0x%hx\n", vscsi->state); + + rc = h_vioctl(vscsi->dds.unit_id, H_GET_PARTNER_INFO, + (u64)vscsi->map_ioba | ((u64)PAGE_SIZE << 32), 0, 0, 0, + 0); + if (rc == H_SUCCESS) { + vscsi->client_data.partition_number = + be64_to_cpu(*(u64 *)vscsi->map_buf); + pr_debug("init_msg, part num %d\n", + vscsi->client_data.partition_number); + } else { + pr_debug("init_msg 
h_vioctl rc %ld\n", rc); + rc = ADAPT_SUCCESS; + } + + if (crq->format == INIT_MSG) { + rc = ibmvscsis_handle_init_msg(vscsi); + } else if (crq->format == INIT_COMPLETE_MSG) { + rc = ibmvscsis_handle_init_compl_msg(vscsi); + } else { + rc = ERROR; + dev_err(&vscsi->dev, "init_msg: invalid format %d\n", + (uint)crq->format); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + } + + return rc; +} + +/** + * ibmvscsis_parse_command() - Parse an element taken from the cmd rsp queue. + * @vscsi: Pointer to our adapter structure + * @crq: Pointer to CRQ element containing the SRP request + * + * This function will return success if the command queue element is valid + * and the srp iu or MAD request it pointed to was also valid. That does + * not mean that an error was not returned to the client. + * + * EXECUTION ENVIRONMENT: + * Interrupt, intr lock held + */ +static long ibmvscsis_parse_command(struct scsi_info *vscsi, + struct viosrp_crq *crq) +{ + long rc = ADAPT_SUCCESS; + + switch (crq->valid) { + case VALID_CMD_RESP_EL: + switch (crq->format) { + case OS400_FORMAT: + case AIX_FORMAT: + case LINUX_FORMAT: + case MAD_FORMAT: + if (vscsi->flags & PROCESSING_MAD) { + rc = ERROR; + dev_err(&vscsi->dev, "parse_command: already processing mad\n"); + ibmvscsis_post_disconnect(vscsi, + ERR_DISCONNECT_RECONNECT, + 0); + } else { + vscsi->flags |= PROCESSING_MAD; + rc = ibmvscsis_mad(vscsi, crq); + } + break; + + case SRP_FORMAT: + ibmvscsis_srp_cmd(vscsi, crq); + break; + + case MESSAGE_IN_CRQ: + if (crq->status == PING) + ibmvscsis_ping_response(vscsi); + break; + + default: + dev_err(&vscsi->dev, "parse_command: invalid format %d\n", + (uint)crq->format); + ibmvscsis_post_disconnect(vscsi, + ERR_DISCONNECT_RECONNECT, 0); + break; + } + break; + + case VALID_TRANS_EVENT: + rc = ibmvscsis_trans_event(vscsi, crq); + break; + + case VALID_INIT_MSG: + rc = ibmvscsis_init_msg(vscsi, crq); + break; + + default: + dev_err(&vscsi->dev, "parse_command: invalid valid 
field %d\n", + (uint)crq->valid); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + break; + } + + /* + * Return only what the interrupt handler cares + * about. Most errors we keep right on trucking. + */ + rc = vscsi->flags & SCHEDULE_DISCONNECT; + + return rc; +} + +static int read_dma_window(struct scsi_info *vscsi) +{ + struct vio_dev *vdev = vscsi->dma_dev; + const __be32 *dma_window; + const __be32 *prop; + + /* TODO Using of_parse_dma_window would be better, but it doesn't give + * a way to read multiple windows without already knowing the size of + * a window or the number of windows. + */ + dma_window = (const __be32 *)vio_get_attribute(vdev, + "ibm,my-dma-window", + NULL); + if (!dma_window) { + pr_err("Couldn't find ibm,my-dma-window property\n"); + return -1; + } + + vscsi->dds.window[LOCAL].liobn = be32_to_cpu(*dma_window); + dma_window++; + + prop = (const __be32 *)vio_get_attribute(vdev, "ibm,#dma-address-cells", + NULL); + if (!prop) { + pr_warn("Couldn't find ibm,#dma-address-cells property\n"); + dma_window++; + } else { + dma_window += be32_to_cpu(*prop); + } + + prop = (const __be32 *)vio_get_attribute(vdev, "ibm,#dma-size-cells", + NULL); + if (!prop) { + pr_warn("Couldn't find ibm,#dma-size-cells property\n"); + dma_window++; + } else { + dma_window += be32_to_cpu(*prop); + } + + /* dma_window should point to the second window now */ + vscsi->dds.window[REMOTE].liobn = be32_to_cpu(*dma_window); + + return 0; +} + +static struct ibmvscsis_tport *ibmvscsis_lookup_port(const char *name) +{ + struct ibmvscsis_tport *tport = NULL; + struct vio_dev *vdev; + struct scsi_info *vscsi; + + spin_lock_bh(&ibmvscsis_dev_lock); + list_for_each_entry(vscsi, &ibmvscsis_dev_list, list) { + vdev = vscsi->dma_dev; + if (!strcmp(dev_name(&vdev->dev), name)) { + tport = &vscsi->tport; + break; + } + } + spin_unlock_bh(&ibmvscsis_dev_lock); + + return tport; +} + +/** + * ibmvscsis_parse_cmd() - Parse SRP Command + * @vscsi: Pointer to our adapter 
structure + * @cmd: Pointer to command element with SRP command + * + * Parse the srp command; if it is valid then submit it to tcm. + * Note: The return code does not reflect the status of the SCSI CDB. + * + * EXECUTION ENVIRONMENT: + * Process level + */ +static void ibmvscsis_parse_cmd(struct scsi_info *vscsi, + struct ibmvscsis_cmd *cmd) +{ + struct iu_entry *iue = cmd->iue; + struct srp_cmd *srp = (struct srp_cmd *)iue->sbuf->buf; + struct ibmvscsis_nexus *nexus; + u64 data_len = 0; + enum dma_data_direction dir; + int attr = 0; + int rc = 0; + + nexus = vscsi->tport.ibmv_nexus; + /* + * additional length in bytes. Note that the SRP spec says that + * additional length is in 4-byte words, but technically the + * additional length field is only the upper 6 bits of the byte. + * The lower 2 bits are reserved. If the lower 2 bits are 0 (as + * all reserved fields should be), then interpreting the byte as + * an int will yield the length in bytes. + */ + if (srp->add_cdb_len & 0x03) { + dev_err(&vscsi->dev, "parse_cmd: reserved bits set in IU\n"); + spin_lock_bh(&vscsi->intr_lock); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + ibmvscsis_free_cmd_resources(vscsi, cmd); + spin_unlock_bh(&vscsi->intr_lock); + return; + } + + if (srp_get_desc_table(srp, &dir, &data_len)) { + dev_err(&vscsi->dev, "0x%llx: parsing SRP descriptor table failed.\n", + srp->tag); + goto fail; + return; + } + + cmd->rsp.sol_not = srp->sol_not; + + switch (srp->task_attr) { + case SRP_SIMPLE_TASK: + attr = TCM_SIMPLE_TAG; + break; + case SRP_ORDERED_TASK: + attr = TCM_ORDERED_TAG; + break; + case SRP_HEAD_TASK: + attr = TCM_HEAD_TAG; + break; + case SRP_ACA_TASK: + attr = TCM_ACA_TAG; + break; + default: + dev_err(&vscsi->dev, "Invalid task attribute %d\n", + srp->task_attr); + goto fail; + } + + cmd->se_cmd.tag = be64_to_cpu(srp->tag); + + spin_lock_bh(&vscsi->intr_lock); + list_add_tail(&cmd->list, &vscsi->active_q); + spin_unlock_bh(&vscsi->intr_lock); + + 
srp->lun.scsi_lun[0] &= 0x3f; + + pr_debug("calling submit_cmd, se_cmd %p, lun 0x%llx, cdb 0x%x, attr:%d\n", + &cmd->se_cmd, scsilun_to_int(&srp->lun), (int)srp->cdb[0], + attr); + + rc = target_submit_cmd(&cmd->se_cmd, nexus->se_sess, srp->cdb, + cmd->sense_buf, scsilun_to_int(&srp->lun), + data_len, attr, dir, 0); + if (rc) { + dev_err(&vscsi->dev, "target_submit_cmd failed, rc %d\n", rc); + goto fail; + } + return; + +fail: + spin_lock_bh(&vscsi->intr_lock); + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + spin_unlock_bh(&vscsi->intr_lock); +} + +/** + * ibmvscsis_parse_task() - Parse SRP Task Management Request + * @vscsi: Pointer to our adapter structure + * @cmd: Pointer to command element with SRP task management request + * + * Parse the srp task management request; if it is valid then submit it to tcm. + * Note: The return code does not reflect the status of the task management + * request. + * + * EXECUTION ENVIRONMENT: + * Processor level + */ +static void ibmvscsis_parse_task(struct scsi_info *vscsi, + struct ibmvscsis_cmd *cmd) +{ + struct iu_entry *iue = cmd->iue; + struct srp_tsk_mgmt *srp_tsk = &vio_iu(iue)->srp.tsk_mgmt; + int tcm_type; + u64 tag_to_abort = 0; + int rc = 0; + struct ibmvscsis_nexus *nexus; + + nexus = vscsi->tport.ibmv_nexus; + + cmd->rsp.sol_not = srp_tsk->sol_not; + + switch (srp_tsk->tsk_mgmt_func) { + case SRP_TSK_ABORT_TASK: + tcm_type = TMR_ABORT_TASK; + tag_to_abort = be64_to_cpu(srp_tsk->task_tag); + break; + case SRP_TSK_ABORT_TASK_SET: + tcm_type = TMR_ABORT_TASK_SET; + break; + case SRP_TSK_CLEAR_TASK_SET: + tcm_type = TMR_CLEAR_TASK_SET; + break; + case SRP_TSK_LUN_RESET: + tcm_type = TMR_LUN_RESET; + break; + case SRP_TSK_CLEAR_ACA: + tcm_type = TMR_CLEAR_ACA; + break; + default: + dev_err(&vscsi->dev, "unknown task mgmt func %d\n", + srp_tsk->tsk_mgmt_func); + cmd->se_cmd.se_tmr_req->response = + TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED; + rc = -1; + break; + } + + if (!rc) { + cmd->se_cmd.tag = 
be64_to_cpu(srp_tsk->tag); + + spin_lock_bh(&vscsi->intr_lock); + list_add_tail(&cmd->list, &vscsi->active_q); + spin_unlock_bh(&vscsi->intr_lock); + + srp_tsk->lun.scsi_lun[0] &= 0x3f; + + pr_debug("calling submit_tmr, func %d\n", + srp_tsk->tsk_mgmt_func); + rc = target_submit_tmr(&cmd->se_cmd, nexus->se_sess, NULL, + scsilun_to_int(&srp_tsk->lun), srp_tsk, + tcm_type, GFP_KERNEL, tag_to_abort, 0); + if (rc) { + dev_err(&vscsi->dev, "target_submit_tmr failed, rc %d\n", + rc); + cmd->se_cmd.se_tmr_req->response = + TMR_FUNCTION_REJECTED; + } + } + + if (rc) + transport_send_check_condition_and_sense(&cmd->se_cmd, 0, 0); +} + +static void ibmvscsis_scheduler(struct work_struct *work) +{ + struct ibmvscsis_cmd *cmd = container_of(work, struct ibmvscsis_cmd, + work); + struct scsi_info *vscsi = cmd->adapter; + + spin_lock_bh(&vscsi->intr_lock); + + /* Remove from schedule_q */ + list_del(&cmd->list); + + /* Don't submit cmd if we're disconnecting */ + if (vscsi->flags & (SCHEDULE_DISCONNECT | DISCONNECT_SCHEDULED)) { + ibmvscsis_free_cmd_resources(vscsi, cmd); + + /* ibmvscsis_disconnect might be waiting for us */ + if (list_empty(&vscsi->active_q) && + list_empty(&vscsi->schedule_q) && + (vscsi->flags & WAIT_FOR_IDLE)) { + vscsi->flags &= ~WAIT_FOR_IDLE; + complete(&vscsi->wait_idle); + } + + spin_unlock_bh(&vscsi->intr_lock); + return; + } + + spin_unlock_bh(&vscsi->intr_lock); + + switch (cmd->type) { + case SCSI_CDB: + ibmvscsis_parse_cmd(vscsi, cmd); + break; + case TASK_MANAGEMENT: + ibmvscsis_parse_task(vscsi, cmd); + break; + default: + dev_err(&vscsi->dev, "scheduler, invalid cmd type %d\n", + cmd->type); + spin_lock_bh(&vscsi->intr_lock); + ibmvscsis_free_cmd_resources(vscsi, cmd); + spin_unlock_bh(&vscsi->intr_lock); + break; + } +} + +static int ibmvscsis_alloc_cmds(struct scsi_info *vscsi, int num) +{ + struct ibmvscsis_cmd *cmd; + int i; + + INIT_LIST_HEAD(&vscsi->free_cmd); + vscsi->cmd_pool = kcalloc(num, sizeof(struct ibmvscsis_cmd), + GFP_KERNEL); + 
if (!vscsi->cmd_pool) + return -ENOMEM; + + for (i = 0, cmd = (struct ibmvscsis_cmd *)vscsi->cmd_pool; i < num; + i++, cmd++) { + cmd->adapter = vscsi; + INIT_WORK(&cmd->work, ibmvscsis_scheduler); + list_add_tail(&cmd->list, &vscsi->free_cmd); + } + + return 0; +} + +static void ibmvscsis_free_cmds(struct scsi_info *vscsi) +{ + kfree(vscsi->cmd_pool); + vscsi->cmd_pool = NULL; + INIT_LIST_HEAD(&vscsi->free_cmd); +} + +/** + * ibmvscsis_service_wait_q() - Service Waiting Queue + * @timer: Pointer to timer which has expired + * + * This routine is called when the timer pops to service the waiting + * queue. Elements on the queue have completed, their responses have been + * copied to the client, but the client's response queue was full so + * the queue message could not be sent. The routine grabs the proper locks + * and calls send messages. + * + * EXECUTION ENVIRONMENT: + * called at interrupt level + */ +static enum hrtimer_restart ibmvscsis_service_wait_q(struct hrtimer *timer) +{ + struct timer_cb *p_timer = container_of(timer, struct timer_cb, timer); + struct scsi_info *vscsi = container_of(p_timer, struct scsi_info, + rsp_q_timer); + + spin_lock_bh(&vscsi->intr_lock); + p_timer->timer_pops += 1; + p_timer->started = false; + ibmvscsis_send_messages(vscsi); + spin_unlock_bh(&vscsi->intr_lock); + + return HRTIMER_NORESTART; +} + +static long ibmvscsis_alloctimer(struct scsi_info *vscsi) +{ + struct timer_cb *p_timer; + + p_timer = &vscsi->rsp_q_timer; + hrtimer_init(&p_timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + + p_timer->timer.function = ibmvscsis_service_wait_q; + p_timer->started = false; + p_timer->timer_pops = 0; + + return ADAPT_SUCCESS; +} + +static void ibmvscsis_freetimer(struct scsi_info *vscsi) +{ + struct timer_cb *p_timer; + + p_timer = &vscsi->rsp_q_timer; + + (void)hrtimer_cancel(&p_timer->timer); + + p_timer->started = false; + p_timer->timer_pops = 0; +} + +static irqreturn_t ibmvscsis_interrupt(int dummy, void *data) +{ + struct 
scsi_info *vscsi = data; + + vio_disable_interrupts(vscsi->dma_dev); + tasklet_schedule(&vscsi->work_task); + + return IRQ_HANDLED; +} + +/** + * ibmvscsis_check_q() - Helper function to Check Init Message Valid + * @vscsi: Pointer to our adapter structure + * + * Checks if a initialize message was queued by the initiatior + * while the timing window was open. This function is called from + * probe after the CRQ is created and interrupts are enabled. + * It would only be used by adapters who wait for some event before + * completing the init handshake with the client. For ibmvscsi, this + * event is waiting for the port to be enabled. + * + * EXECUTION ENVIRONMENT: + * Process level only, interrupt lock held + */ +static long ibmvscsis_check_q(struct scsi_info *vscsi) +{ + uint format; + long rc; + + rc = ibmvscsis_check_init_msg(vscsi, &format); + if (rc) + ibmvscsis_post_disconnect(vscsi, ERR_DISCONNECT_RECONNECT, 0); + else if (format == UNUSED_FORMAT) + vscsi->state = WAIT_ENABLED; + else + vscsi->state = PART_UP_WAIT_ENAB; + + return rc; +} + +/** + * ibmvscsis_enable_change_state() - Set new state based on enabled status + * @vscsi: Pointer to our adapter structure + * + * This function determines our new state now that we are enabled. This + * may involve sending an Init Complete message to the client. + * + * Must be called with interrupt lock held. 
+ */ +static long ibmvscsis_enable_change_state(struct scsi_info *vscsi) +{ + long rc = ADAPT_SUCCESS; + +handle_state_change: + switch (vscsi->state) { + case WAIT_ENABLED: + rc = ibmvscsis_send_init_message(vscsi, INIT_MSG); + switch (rc) { + case H_SUCCESS: + case H_DROPPED: + case H_CLOSED: + vscsi->state = WAIT_CONNECTION; + rc = ADAPT_SUCCESS; + break; + + case H_PARAMETER: + break; + + case H_HARDWARE: + break; + + default: + vscsi->state = UNDEFINED; + rc = H_HARDWARE; + break; + } + break; + case PART_UP_WAIT_ENAB: + rc = ibmvscsis_send_init_message(vscsi, INIT_COMPLETE_MSG); + switch (rc) { + case H_SUCCESS: + vscsi->state = CONNECTED; + rc = ADAPT_SUCCESS; + break; + + case H_DROPPED: + case H_CLOSED: + vscsi->state = WAIT_ENABLED; + goto handle_state_change; + + case H_PARAMETER: + break; + + case H_HARDWARE: + break; + + default: + rc = H_HARDWARE; + break; + } + break; + + case WAIT_CONNECTION: + case WAIT_IDLE: + case SRP_PROCESSING: + case CONNECTED: + rc = ADAPT_SUCCESS; + break; + /* should not be able to get here */ + case UNCONFIGURING: + rc = ERROR; + vscsi->state = UNDEFINED; + break; + + /* driver should never allow this to happen */ + case ERR_DISCONNECT: + case ERR_DISCONNECT_RECONNECT: + default: + dev_err(&vscsi->dev, "in invalid state %d during enable_change_state\n", + vscsi->state); + rc = ADAPT_SUCCESS; + break; + } + + return rc; +} + +/** + * ibmvscsis_create_command_q() - Create Command Queue + * @vscsi: Pointer to our adapter structure + * @num_cmds: Currently unused. In the future, may be used to determine + * the size of the CRQ. 
+ * + * Allocates memory for command queue maps remote memory into an ioba + * initializes the command response queue + * + * EXECUTION ENVIRONMENT: + * Process level only + */ +static long ibmvscsis_create_command_q(struct scsi_info *vscsi, int num_cmds) +{ + long rc = 0; + int pages; + struct vio_dev *vdev = vscsi->dma_dev; + + /* We might support multiple pages in the future, but just 1 for now */ + pages = 1; + + vscsi->cmd_q.size = pages; + + vscsi->cmd_q.base_addr = + (struct viosrp_crq *)get_zeroed_page(GFP_KERNEL); + if (!vscsi->cmd_q.base_addr) + return -ENOMEM; + + vscsi->cmd_q.mask = ((uint)pages * CRQ_PER_PAGE) - 1; + + vscsi->cmd_q.crq_token = dma_map_single(&vdev->dev, + vscsi->cmd_q.base_addr, + PAGE_SIZE, DMA_BIDIRECTIONAL); + if (dma_mapping_error(&vdev->dev, vscsi->cmd_q.crq_token)) { + free_page((unsigned long)vscsi->cmd_q.base_addr); + return -ENOMEM; + } + + rc = h_reg_crq(vscsi->dds.unit_id, vscsi->cmd_q.crq_token, PAGE_SIZE); + if (rc) { + if (rc == H_CLOSED) { + vscsi->state = WAIT_ENABLED; + rc = 0; + } else { + dma_unmap_single(&vdev->dev, vscsi->cmd_q.crq_token, + PAGE_SIZE, DMA_BIDIRECTIONAL); + free_page((unsigned long)vscsi->cmd_q.base_addr); + rc = -ENODEV; + } + } else { + vscsi->state = WAIT_ENABLED; + } + + return rc; +} + +/** + * ibmvscsis_destroy_command_q - Destroy Command Queue + * @vscsi: Pointer to our adapter structure + * + * Releases memory for command queue and unmaps mapped remote memory. 
+ * + * EXECUTION ENVIRONMENT: + * Process level only + */ +static void ibmvscsis_destroy_command_q(struct scsi_info *vscsi) +{ + dma_unmap_single(&vscsi->dma_dev->dev, vscsi->cmd_q.crq_token, + PAGE_SIZE, DMA_BIDIRECTIONAL); + free_page((unsigned long)vscsi->cmd_q.base_addr); + vscsi->cmd_q.base_addr = NULL; + vscsi->state = NO_QUEUE; +} + +static u8 ibmvscsis_fast_fail(struct scsi_info *vscsi, + struct ibmvscsis_cmd *cmd) +{ + struct iu_entry *iue = cmd->iue; + struct se_cmd *se_cmd = &cmd->se_cmd; + struct srp_cmd *srp = (struct srp_cmd *)iue->sbuf->buf; + struct scsi_sense_hdr sshdr; + u8 rc = se_cmd->scsi_status; + + if (vscsi->fast_fail && (READ_CMD(srp->cdb) || WRITE_CMD(srp->cdb))) + if (scsi_normalize_sense(se_cmd->sense_buffer, + se_cmd->scsi_sense_length, &sshdr)) + if (sshdr.sense_key == HARDWARE_ERROR && + (se_cmd->residual_count == 0 || + se_cmd->residual_count == se_cmd->data_length)) { + rc = NO_SENSE; + cmd->flags |= CMD_FAST_FAIL; + } + + return rc; +} + +/** + * srp_build_response() - Build an SRP response buffer + * @vscsi: Pointer to our adapter structure + * @cmd: Pointer to command for which to send the response + * @len_p: Where to return the length of the IU response sent. This + * is needed to construct the CRQ response. + * + * Build the SRP response buffer and copy it to the client's memory space. 
+ */ +static long srp_build_response(struct scsi_info *vscsi, + struct ibmvscsis_cmd *cmd, uint *len_p) +{ + struct iu_entry *iue = cmd->iue; + struct se_cmd *se_cmd = &cmd->se_cmd; + struct srp_rsp *rsp; + uint len; + u32 rsp_code; + char *data; + u32 *tsk_status; + long rc = ADAPT_SUCCESS; + + spin_lock_bh(&vscsi->intr_lock); + + rsp = &vio_iu(iue)->srp.rsp; + len = sizeof(*rsp); + memset(rsp, 0, len); + data = rsp->data; + + rsp->opcode = SRP_RSP; + + if (vscsi->credit > 0 && vscsi->state == SRP_PROCESSING) + rsp->req_lim_delta = cpu_to_be32(vscsi->credit); + else + rsp->req_lim_delta = cpu_to_be32(1 + vscsi->credit); + rsp->tag = cmd->rsp.tag; + rsp->flags = 0; + + if (cmd->type == SCSI_CDB) { + rsp->status = ibmvscsis_fast_fail(vscsi, cmd); + if (rsp->status) { + pr_debug("build_resp: cmd %p, scsi status %d\n", cmd, + (int)rsp->status); + ibmvscsis_determine_resid(se_cmd, rsp); + if (se_cmd->scsi_sense_length && se_cmd->sense_buffer) { + rsp->sense_data_len = + cpu_to_be32(se_cmd->scsi_sense_length); + rsp->flags |= SRP_RSP_FLAG_SNSVALID; + len += se_cmd->scsi_sense_length; + memcpy(data, se_cmd->sense_buffer, + se_cmd->scsi_sense_length); + } + rsp->sol_not = (cmd->rsp.sol_not & UCSOLNT) >> + UCSOLNT_RESP_SHIFT; + } else if (cmd->flags & CMD_FAST_FAIL) { + pr_debug("build_resp: cmd %p, fast fail\n", cmd); + rsp->sol_not = (cmd->rsp.sol_not & UCSOLNT) >> + UCSOLNT_RESP_SHIFT; + } else { + rsp->sol_not = (cmd->rsp.sol_not & SCSOLNT) >> + SCSOLNT_RESP_SHIFT; + } + } else { + /* this is task management */ + rsp->status = 0; + rsp->resp_data_len = cpu_to_be32(4); + rsp->flags |= SRP_RSP_FLAG_RSPVALID; + + switch (se_cmd->se_tmr_req->response) { + case TMR_FUNCTION_COMPLETE: + case TMR_TASK_DOES_NOT_EXIST: + rsp_code = SRP_TASK_MANAGEMENT_FUNCTION_COMPLETE; + rsp->sol_not = (cmd->rsp.sol_not & SCSOLNT) >> + SCSOLNT_RESP_SHIFT; + break; + case TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED: + case TMR_LUN_DOES_NOT_EXIST: + rsp_code = 
SRP_TASK_MANAGEMENT_FUNCTION_NOT_SUPPORTED; + rsp->sol_not = (cmd->rsp.sol_not & UCSOLNT) >> + UCSOLNT_RESP_SHIFT; + break; + case TMR_FUNCTION_FAILED: + case TMR_FUNCTION_REJECTED: + default: + rsp_code = SRP_TASK_MANAGEMENT_FUNCTION_FAILED; + rsp->sol_not = (cmd->rsp.sol_not & UCSOLNT) >> + UCSOLNT_RESP_SHIFT; + break; + } + + tsk_status = (u32 *)data; + *tsk_status = cpu_to_be32(rsp_code); + data = (char *)(tsk_status + 1); + len += 4; + } + + dma_wmb(); + rc = h_copy_rdma(len, vscsi->dds.window[LOCAL].liobn, iue->sbuf->dma, + vscsi->dds.window[REMOTE].liobn, + be64_to_cpu(iue->remote_token)); + + switch (rc) { + case H_SUCCESS: + vscsi->credit = 0; + *len_p = len; + break; + case H_PERMISSION: + if (connection_broken(vscsi)) + vscsi->flags |= RESPONSE_Q_DOWN | CLIENT_FAILED; + + dev_err(&vscsi->dev, "build_response: error copying to client, rc %ld, flags 0x%x, state 0x%hx\n", + rc, vscsi->flags, vscsi->state); + break; + case H_SOURCE_PARM: + case H_DEST_PARM: + default: + dev_err(&vscsi->dev, "build_response: error copying to client, rc %ld\n", + rc); + break; + } + + spin_unlock_bh(&vscsi->intr_lock); + + return rc; +} + +static int ibmvscsis_rdma(struct ibmvscsis_cmd *cmd, struct scatterlist *sg, + int nsg, struct srp_direct_buf *md, int nmd, + enum dma_data_direction dir, unsigned int bytes) +{ + struct iu_entry *iue = cmd->iue; + struct srp_target *target = iue->target; + struct scsi_info *vscsi = target->ldata; + struct scatterlist *sgp; + dma_addr_t client_ioba, server_ioba; + ulong buf_len; + ulong client_len, server_len; + int md_idx; + long tx_len; + long rc = 0; + + pr_debug("rdma: dir %d, bytes 0x%x\n", dir, bytes); + + if (bytes == 0) + return 0; + + sgp = sg; + client_len = 0; + server_len = 0; + md_idx = 0; + tx_len = bytes; + + do { + if (client_len == 0) { + if (md_idx >= nmd) { + dev_err(&vscsi->dev, "rdma: ran out of client memory descriptors\n"); + rc = -EIO; + break; + } + client_ioba = be64_to_cpu(md[md_idx].va); + client_len = 
be32_to_cpu(md[md_idx].len); + } + if (server_len == 0) { + if (!sgp) { + dev_err(&vscsi->dev, "rdma: ran out of scatter/gather list\n"); + rc = -EIO; + break; + } + server_ioba = sg_dma_address(sgp); + server_len = sg_dma_len(sgp); + } + + buf_len = tx_len; + + if (buf_len > client_len) + buf_len = client_len; + + if (buf_len > server_len) + buf_len = server_len; + + if (buf_len > max_vdma_size) + buf_len = max_vdma_size; + + if (dir == DMA_TO_DEVICE) { + /* read from client */ + rc = h_copy_rdma(buf_len, + vscsi->dds.window[REMOTE].liobn, + client_ioba, + vscsi->dds.window[LOCAL].liobn, + server_ioba); + } else { + /* write to client */ + struct srp_cmd *srp = (struct srp_cmd *)iue->sbuf->buf; + + if (!READ_CMD(srp->cdb)) + print_hex_dump_bytes(" data:", DUMP_PREFIX_NONE, + sg_virt(sgp), buf_len); + /* The h_copy_rdma will cause phyp, running in another + * partition, to read memory, so we need to make sure + * the data has been written out, hence these syncs. + */ + /* ensure that everything is in memory */ + isync(); + /* ensure that memory has been made visible */ + dma_wmb(); + rc = h_copy_rdma(buf_len, + vscsi->dds.window[LOCAL].liobn, + server_ioba, + vscsi->dds.window[REMOTE].liobn, + client_ioba); + } + switch (rc) { + case H_SUCCESS: + break; + case H_PERMISSION: + case H_SOURCE_PARM: + case H_DEST_PARM: + if (connection_broken(vscsi)) { + spin_lock_bh(&vscsi->intr_lock); + vscsi->flags |= + (RESPONSE_Q_DOWN | CLIENT_FAILED); + spin_unlock_bh(&vscsi->intr_lock); + } + dev_err(&vscsi->dev, "rdma: h_copy_rdma failed, rc %ld\n", + rc); + break; + + default: + dev_err(&vscsi->dev, "rdma: unknown error %ld from h_copy_rdma\n", + rc); + break; + } + + if (!rc) { + tx_len -= buf_len; + if (tx_len) { + client_len -= buf_len; + if (client_len == 0) + md_idx++; + else + client_ioba += buf_len; + + server_len -= buf_len; + if (server_len == 0) + sgp = sg_next(sgp); + else + server_ioba += buf_len; + } else { + break; + } + } + } while (!rc); + + return rc; +} + 
+/** + * ibmvscsis_handle_crq() - Handle CRQ + * @data: Pointer to our adapter structure + * + * Read the command elements from the command queue and copy the payloads + * associated with the command elements to local memory and execute the + * SRP requests. + * + * Note: this is an edge triggered interrupt. It can not be shared. + */ +static void ibmvscsis_handle_crq(unsigned long data) +{ + struct scsi_info *vscsi = (struct scsi_info *)data; + struct viosrp_crq *crq; + long rc; + bool ack = true; + volatile u8 valid; + + spin_lock_bh(&vscsi->intr_lock); + + pr_debug("got interrupt\n"); + + /* + * if we are in a path where we are waiting for all pending commands + * to complete because we received a transport event and anything in + * the command queue is for a new connection, do nothing + */ + if (TARGET_STOP(vscsi)) { + vio_enable_interrupts(vscsi->dma_dev); + + pr_debug("handle_crq, don't process: flags 0x%x, state 0x%hx\n", + vscsi->flags, vscsi->state); + spin_unlock_bh(&vscsi->intr_lock); + return; + } + + rc = vscsi->flags & SCHEDULE_DISCONNECT; + crq = vscsi->cmd_q.base_addr + vscsi->cmd_q.index; + valid = crq->valid; + dma_rmb(); + + while (valid) { + /* + * These are edege triggered interrupts. After dropping out of + * the while loop, the code must check for work since an + * interrupt could be lost, and an elment be left on the queue, + * hence the label. 
+ */ +cmd_work: + vscsi->cmd_q.index = + (vscsi->cmd_q.index + 1) & vscsi->cmd_q.mask; + + if (!rc) { + rc = ibmvscsis_parse_command(vscsi, crq); + } else { + if ((uint)crq->valid == VALID_TRANS_EVENT) { + /* + * must service the transport layer events even + * in an error state, dont break out until all + * the consecutive transport events have been + * processed + */ + rc = ibmvscsis_trans_event(vscsi, crq); + } else if (vscsi->flags & TRANS_EVENT) { + /* + * if a tranport event has occurred leave + * everything but transport events on the queue + */ + pr_debug("handle_crq, ignoring\n"); + + /* + * need to decrement the queue index so we can + * look at the elment again + */ + if (vscsi->cmd_q.index) + vscsi->cmd_q.index -= 1; + else + /* + * index is at 0 it just wrapped. + * have it index last element in q + */ + vscsi->cmd_q.index = vscsi->cmd_q.mask; + break; + } + } + + crq->valid = INVALIDATE_CMD_RESP_EL; + + crq = vscsi->cmd_q.base_addr + vscsi->cmd_q.index; + valid = crq->valid; + dma_rmb(); + } + + if (!rc) { + if (ack) { + vio_enable_interrupts(vscsi->dma_dev); + ack = false; + pr_debug("handle_crq, reenabling interrupts\n"); + } + valid = crq->valid; + dma_rmb(); + if (valid) + goto cmd_work; + } else { + pr_debug("handle_crq, error: flags 0x%x, state 0x%hx, crq index 0x%x\n", + vscsi->flags, vscsi->state, vscsi->cmd_q.index); + } + + pr_debug("Leaving handle_crq: schedule_q empty %d, flags 0x%x, state 0x%hx\n", + (int)list_empty(&vscsi->schedule_q), vscsi->flags, + vscsi->state); + + spin_unlock_bh(&vscsi->intr_lock); +} + +static int ibmvscsis_probe(struct vio_dev *vdev, + const struct vio_device_id *id) +{ + struct scsi_info *vscsi; + int rc = 0; + long hrc = 0; + char wq_name[24]; + + vscsi = kzalloc(sizeof(*vscsi), GFP_KERNEL); + if (!vscsi) { + rc = -ENOMEM; + pr_err("probe: allocation of adapter failed\n"); + return rc; + } + + vscsi->dma_dev = vdev; + vscsi->dev = vdev->dev; + INIT_LIST_HEAD(&vscsi->schedule_q); + 
INIT_LIST_HEAD(&vscsi->waiting_rsp); + INIT_LIST_HEAD(&vscsi->active_q); + + snprintf(vscsi->tport.tport_name, 256, "%s", dev_name(&vdev->dev)); + + pr_debug("probe tport_name: %s\n", vscsi->tport.tport_name); + + rc = read_dma_window(vscsi); + if (rc) + goto free_adapter; + pr_debug("Probe: liobn 0x%x, riobn 0x%x\n", + vscsi->dds.window[LOCAL].liobn, + vscsi->dds.window[REMOTE].liobn); + + strcpy(vscsi->eye, "VSCSI "); + strncat(vscsi->eye, vdev->name, MAX_EYE); + + vscsi->dds.unit_id = vdev->unit_address; + + spin_lock_bh(&ibmvscsis_dev_lock); + list_add_tail(&vscsi->list, &ibmvscsis_dev_list); + spin_unlock_bh(&ibmvscsis_dev_lock); + + /* + * TBD: How do we determine # of cmds to request? Do we know how + * many "children" we have? + */ + vscsi->request_limit = INITIAL_SRP_LIMIT; + rc = srp_target_alloc(&vscsi->target, &vdev->dev, vscsi->request_limit, + SRP_MAX_IU_LEN); + if (rc) + goto rem_list; + + vscsi->target.ldata = vscsi; + + rc = ibmvscsis_alloc_cmds(vscsi, vscsi->request_limit); + if (rc) { + dev_err(&vscsi->dev, "alloc_cmds failed, rc %d, num %d\n", + rc, vscsi->request_limit); + goto free_target; + } + + /* + * Note: the lock is used in freeing timers, so must initialize + * first so that ordering in case of error is correct. 
+ */ + spin_lock_init(&vscsi->intr_lock); + + rc = ibmvscsis_alloctimer(vscsi); + if (rc) { + dev_err(&vscsi->dev, "probe: alloctimer failed, rc %d\n", rc); + goto free_cmds; + } + + rc = ibmvscsis_create_command_q(vscsi, 256); + if (rc) { + dev_err(&vscsi->dev, "probe: create_command_q failed, rc %d\n", + rc); + goto free_timer; + } + + vscsi->map_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!vscsi->map_buf) { + rc = -ENOMEM; + dev_err(&vscsi->dev, "probe: allocating cmd buffer failed\n"); + goto destroy_queue; + } + + vscsi->map_ioba = dma_map_single(&vdev->dev, vscsi->map_buf, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(&vdev->dev, vscsi->map_ioba)) { + dev_err(&vscsi->dev, "probe: error mapping command buffer\n"); + goto free_buf; + } + + hrc = h_vioctl(vscsi->dds.unit_id, H_GET_PARTNER_INFO, + (u64)vscsi->map_ioba | ((u64)PAGE_SIZE << 32), 0, 0, 0, + 0); + if (hrc == H_SUCCESS) + vscsi->client_data.partition_number = + be64_to_cpu(*(u64 *)vscsi->map_buf); + /* + * We expect the VIOCTL to fail if we're configured as "any + * client can connect" and the client isn't activated yet. + * We'll make the call again when he sends an init msg. 
+ */ + pr_debug("probe hrc %ld, client partition num %d\n", + hrc, vscsi->client_data.partition_number); + + tasklet_init(&vscsi->work_task, ibmvscsis_handle_crq, + (unsigned long)vscsi); + + init_completion(&vscsi->wait_idle); + + snprintf(wq_name, 24, "ibmvscsis%s", dev_name(&vdev->dev)); + vscsi->work_q = create_workqueue(wq_name); + if (!vscsi->work_q) { + rc = -ENOMEM; + dev_err(&vscsi->dev, "create_workqueue failed\n"); + goto unmap_buf; + } + + rc = request_irq(vdev->irq, ibmvscsis_interrupt, 0, "ibmvscsis", vscsi); + if (rc) { + rc = -EPERM; + dev_err(&vscsi->dev, "probe: request_irq failed, rc %d\n", rc); + goto destroy_WQ; + } + + spin_lock_bh(&vscsi->intr_lock); + vio_enable_interrupts(vdev); + if (rc) { + dev_err(&vscsi->dev, "enabling interrupts failed, rc %d\n", rc); + rc = -ENODEV; + spin_unlock_bh(&vscsi->intr_lock); + goto free_irq; + } + + if (ibmvscsis_check_q(vscsi)) { + rc = ERROR; + dev_err(&vscsi->dev, "probe: check_q failed, rc %d\n", rc); + spin_unlock_bh(&vscsi->intr_lock); + goto disable_interrupt; + } + spin_unlock_bh(&vscsi->intr_lock); + + dev_set_drvdata(&vdev->dev, vscsi); + + return 0; + +disable_interrupt: + vio_disable_interrupts(vdev); +free_irq: + free_irq(vdev->irq, vscsi); +destroy_WQ: + destroy_workqueue(vscsi->work_q); +unmap_buf: + dma_unmap_single(&vdev->dev, vscsi->map_ioba, PAGE_SIZE, + DMA_BIDIRECTIONAL); +free_buf: + kfree(vscsi->map_buf); +destroy_queue: + tasklet_kill(&vscsi->work_task); + ibmvscsis_unregister_command_q(vscsi); + ibmvscsis_destroy_command_q(vscsi); +free_timer: + ibmvscsis_freetimer(vscsi); +free_cmds: + ibmvscsis_free_cmds(vscsi); +free_target: + srp_target_free(&vscsi->target); +rem_list: + spin_lock_bh(&ibmvscsis_dev_lock); + list_del(&vscsi->list); + spin_unlock_bh(&ibmvscsis_dev_lock); +free_adapter: + kfree(vscsi); + + return rc; +} + +static int ibmvscsis_remove(struct vio_dev *vdev) +{ + struct scsi_info *vscsi = dev_get_drvdata(&vdev->dev); + + pr_debug("remove (%s)\n", 
dev_name(&vscsi->dma_dev->dev)); + + /* + * TBD: Need to handle if there are commands on the waiting_rsp q + * Actually, can there still be cmds outstanding to tcm? + */ + + vio_disable_interrupts(vdev); + free_irq(vdev->irq, vscsi); + destroy_workqueue(vscsi->work_q); + dma_unmap_single(&vdev->dev, vscsi->map_ioba, PAGE_SIZE, + DMA_BIDIRECTIONAL); + kfree(vscsi->map_buf); + tasklet_kill(&vscsi->work_task); + ibmvscsis_unregister_command_q(vscsi); + ibmvscsis_destroy_command_q(vscsi); + ibmvscsis_freetimer(vscsi); + ibmvscsis_free_cmds(vscsi); + srp_target_free(&vscsi->target); + spin_lock_bh(&ibmvscsis_dev_lock); + list_del(&vscsi->list); + spin_unlock_bh(&ibmvscsis_dev_lock); + kfree(vscsi); + + return 0; +} + +static ssize_t system_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", system_id); +} + +static ssize_t partition_number_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%x\n", partition_number); +} + +static ssize_t unit_address_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct scsi_info *vscsi = container_of(dev, struct scsi_info, dev); + + return snprintf(buf, PAGE_SIZE, "%x\n", vscsi->dma_dev->unit_address); +} + +/* + * ibmvscsis_get_system_info() - Cache device tree properties + * + * Reads "model", "system-id", "ibm,partition-name" and "ibm,partition-no" + * from the root node and "ibm,max-virtual-dma-size" from /vdevice into + * system_id, partition_name, partition_number and max_vdma_size. Missing + * properties are skipped and leave the corresponding variable untouched. + * + * NOTE(review): the numeric properties are read with a plain dereference; + * confirm whether they need a big-endian conversion on little-endian hosts. + * + * Return: -ENOENT if the root node cannot be found, 0 otherwise. + */ +static int ibmvscsis_get_system_info(void) +{ + struct device_node *rootdn, *vdevdn; + const char *id, *model, *name; + const uint *num; + + rootdn = of_find_node_by_path("/"); + if (!rootdn) + return -ENOENT; + + model = of_get_property(rootdn, "model", NULL); + id = of_get_property(rootdn, "system-id", NULL); + if (model && id) + snprintf(system_id, sizeof(system_id), "%s-%s", model, id); + + name = of_get_property(rootdn, "ibm,partition-name", NULL); + if (name) + /* snprintf always terminates; strncpy does not on truncation */ + snprintf(partition_name, sizeof(partition_name), "%s", name); + + num = of_get_property(rootdn, "ibm,partition-no", NULL); + if (num) + partition_number = *num; + + of_node_put(rootdn); + + vdevdn = of_find_node_by_path("/vdevice"); 
+ if (vdevdn) { + const uint *mvds; + + mvds = of_get_property(vdevdn, "ibm,max-virtual-dma-size", + NULL); + if (mvds) + max_vdma_size = *mvds; + of_node_put(vdevdn); + } + + return 0; +} + +static char *ibmvscsis_get_fabric_name(void) +{ + return "ibmvscsis"; +} + +static char *ibmvscsis_get_fabric_wwn(struct se_portal_group *se_tpg) +{ + struct ibmvscsis_tport *tport = + container_of(se_tpg, struct ibmvscsis_tport, se_tpg); + + return tport->tport_name; +} + +static u16 ibmvscsis_get_tag(struct se_portal_group *se_tpg) +{ + struct ibmvscsis_tport *tport = + container_of(se_tpg, struct ibmvscsis_tport, se_tpg); + + return tport->tport_tpgt; +} + +static u32 ibmvscsis_get_default_depth(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int ibmvscsis_check_true(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int ibmvscsis_check_false(struct se_portal_group *se_tpg) +{ + return 0; +} + +static u32 ibmvscsis_tpg_get_inst_index(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int ibmvscsis_check_stop_free(struct se_cmd *se_cmd) +{ + return target_put_sess_cmd(se_cmd); +} + +static void ibmvscsis_release_cmd(struct se_cmd *se_cmd) +{ + struct ibmvscsis_cmd *cmd = container_of(se_cmd, struct ibmvscsis_cmd, + se_cmd); + struct scsi_info *vscsi = cmd->adapter; + + pr_debug("release_cmd %p, flags %d\n", se_cmd, cmd->flags); + + spin_lock_bh(&vscsi->intr_lock); + /* Remove from active_q */ + list_del(&cmd->list); + list_add_tail(&cmd->list, &vscsi->waiting_rsp); + ibmvscsis_send_messages(vscsi); + spin_unlock_bh(&vscsi->intr_lock); +} + +static u32 ibmvscsis_sess_get_index(struct se_session *se_sess) +{ + return 0; +} + +static int ibmvscsis_write_pending(struct se_cmd *se_cmd) +{ + struct ibmvscsis_cmd *cmd = container_of(se_cmd, struct ibmvscsis_cmd, + se_cmd); + struct iu_entry *iue = cmd->iue; + int rc; + + pr_debug("write_pending, se_cmd %p, length 0x%x\n", + se_cmd, se_cmd->data_length); + + rc = srp_transfer_data(cmd, 
&vio_iu(iue)->srp.cmd, ibmvscsis_rdma, + 1, 1); + if (rc) { + pr_err("srp_transfer_data() failed: %d\n", rc); + return -EAGAIN; + } + /* + * We now tell TCM to add this WRITE CDB directly into the TCM storage + * object execution queue. + */ + target_execute_cmd(se_cmd); + return 0; +} + +static int ibmvscsis_write_pending_status(struct se_cmd *se_cmd) +{ + return 0; +} + +static void ibmvscsis_set_default_node_attrs(struct se_node_acl *nacl) +{ +} + +static int ibmvscsis_get_cmd_state(struct se_cmd *se_cmd) +{ + return 0; +} + +static int ibmvscsis_queue_data_in(struct se_cmd *se_cmd) +{ + struct ibmvscsis_cmd *cmd = container_of(se_cmd, struct ibmvscsis_cmd, + se_cmd); + struct iu_entry *iue = cmd->iue; + struct scsi_info *vscsi = cmd->adapter; + char *sd; + uint len = 0; + int rc; + + pr_debug("queue_data_in, se_cmd %p, length 0x%x\n", + se_cmd, se_cmd->data_length); + + rc = srp_transfer_data(cmd, &vio_iu(iue)->srp.cmd, ibmvscsis_rdma, 1, + 1); + if (rc) { + pr_err("srp_transfer_data failed: %d\n", rc); + sd = se_cmd->sense_buffer; + se_cmd->scsi_sense_length = 18; + memset(se_cmd->sense_buffer, 0, se_cmd->scsi_sense_length); + /* Logical Unit Communication Time-out asc/ascq = 0x0801 */ + scsi_build_sense_buffer(0, se_cmd->sense_buffer, MEDIUM_ERROR, + 0x08, 0x01); + } + + srp_build_response(vscsi, cmd, &len); + cmd->rsp.format = SRP_FORMAT; + cmd->rsp.len = len; + + return 0; +} + +static int ibmvscsis_queue_status(struct se_cmd *se_cmd) +{ + struct ibmvscsis_cmd *cmd = container_of(se_cmd, struct ibmvscsis_cmd, + se_cmd); + struct scsi_info *vscsi = cmd->adapter; + uint len; + + pr_debug("queue_status %p\n", se_cmd); + + srp_build_response(vscsi, cmd, &len); + cmd->rsp.format = SRP_FORMAT; + cmd->rsp.len = len; + + return 0; +} + +static void ibmvscsis_queue_tm_rsp(struct se_cmd *se_cmd) +{ + struct ibmvscsis_cmd *cmd = container_of(se_cmd, struct ibmvscsis_cmd, + se_cmd); + struct scsi_info *vscsi = cmd->adapter; + uint len; + + pr_debug("queue_tm_rsp %p, 
status %d\n", + se_cmd, (int)se_cmd->se_tmr_req->response); + + srp_build_response(vscsi, cmd, &len); + cmd->rsp.format = SRP_FORMAT; + cmd->rsp.len = len; +} + +static void ibmvscsis_aborted_task(struct se_cmd *se_cmd) +{ + /* TBD: What (if anything) should we do here? */ + pr_debug("ibmvscsis_aborted_task %p\n", se_cmd); +} + +static struct se_wwn *ibmvscsis_make_tport(struct target_fabric_configfs *tf, + struct config_group *group, + const char *name) +{ + struct ibmvscsis_tport *tport; + + tport = ibmvscsis_lookup_port(name); + if (tport) { + tport->tport_proto_id = SCSI_PROTOCOL_SRP; + pr_debug("make_tport(%s), pointer:%p, tport_id:%x\n", + name, tport, tport->tport_proto_id); + return &tport->tport_wwn; + } + + return ERR_PTR(-EINVAL); +} + +static void ibmvscsis_drop_tport(struct se_wwn *wwn) +{ + struct ibmvscsis_tport *tport = container_of(wwn, + struct ibmvscsis_tport, + tport_wwn); + + pr_debug("drop_tport(%s)\n", + config_item_name(&tport->tport_wwn.wwn_group.cg_item)); +} + +static struct se_portal_group *ibmvscsis_make_tpg(struct se_wwn *wwn, + struct config_group *group, + const char *name) +{ + struct ibmvscsis_tport *tport = + container_of(wwn, struct ibmvscsis_tport, tport_wwn); + int rc; + + tport->releasing = false; + + rc = core_tpg_register(&tport->tport_wwn, &tport->se_tpg, + tport->tport_proto_id); + if (rc) + return ERR_PTR(rc); + + return &tport->se_tpg; +} + +static void ibmvscsis_drop_tpg(struct se_portal_group *se_tpg) +{ + struct ibmvscsis_tport *tport = container_of(se_tpg, + struct ibmvscsis_tport, + se_tpg); + + tport->releasing = true; + tport->enabled = false; + + /* + * Release the virtual I_T Nexus for this ibmvscsis TPG + */ + ibmvscsis_drop_nexus(tport); + /* + * Deregister the se_tpg from TCM.. 
+ */ + core_tpg_deregister(se_tpg); +} + +static ssize_t ibmvscsis_wwn_version_show(struct config_item *item, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "%s\n", IBMVSCSIS_VERSION); +} +CONFIGFS_ATTR_RO(ibmvscsis_wwn_, version); + +static struct configfs_attribute *ibmvscsis_wwn_attrs[] = { + &ibmvscsis_wwn_attr_version, + NULL, +}; + +static ssize_t ibmvscsis_tpg_enable_show(struct config_item *item, + char *page) +{ + struct se_portal_group *se_tpg = to_tpg(item); + struct ibmvscsis_tport *tport = container_of(se_tpg, + struct ibmvscsis_tport, + se_tpg); + + return snprintf(page, PAGE_SIZE, "%d\n", (tport->enabled) ? 1 : 0); +} + +static ssize_t ibmvscsis_tpg_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct se_portal_group *se_tpg = to_tpg(item); + struct ibmvscsis_tport *tport = container_of(se_tpg, + struct ibmvscsis_tport, + se_tpg); + struct scsi_info *vscsi = container_of(tport, struct scsi_info, tport); + unsigned long tmp; + int rc; + long lrc; + + rc = kstrtoul(page, 0, &tmp); + if (rc < 0) { + pr_err("Unable to extract srpt_tpg_store_enable\n"); + return -EINVAL; + } + + if ((tmp != 0) && (tmp != 1)) { + pr_err("Illegal value for srpt_tpg_store_enable\n"); + return -EINVAL; + } + + if (tmp) { + tport->enabled = true; + spin_lock_bh(&vscsi->intr_lock); + lrc = ibmvscsis_enable_change_state(vscsi); + if (lrc) + pr_err("enable_change_state failed, rc %ld state %d\n", + lrc, vscsi->state); + spin_unlock_bh(&vscsi->intr_lock); + } else { + tport->enabled = false; + } + + pr_debug("tpg_enable_store, state %d\n", vscsi->state); + + return count; +} +CONFIGFS_ATTR(ibmvscsis_tpg_, enable); + +static struct configfs_attribute *ibmvscsis_tpg_attrs[] = { + &ibmvscsis_tpg_attr_enable, + NULL, +}; + +static const struct target_core_fabric_ops ibmvscsis_ops = { + .module = THIS_MODULE, + .name = "ibmvscsis", + .get_fabric_name = ibmvscsis_get_fabric_name, + .tpg_get_wwn = ibmvscsis_get_fabric_wwn, + .tpg_get_tag = 
ibmvscsis_get_tag, + .tpg_get_default_depth = ibmvscsis_get_default_depth, + .tpg_check_demo_mode = ibmvscsis_check_true, + .tpg_check_demo_mode_cache = ibmvscsis_check_true, + .tpg_check_demo_mode_write_protect = ibmvscsis_check_false, + .tpg_check_prod_mode_write_protect = ibmvscsis_check_false, + .tpg_get_inst_index = ibmvscsis_tpg_get_inst_index, + .check_stop_free = ibmvscsis_check_stop_free, + .release_cmd = ibmvscsis_release_cmd, + .sess_get_index = ibmvscsis_sess_get_index, + .write_pending = ibmvscsis_write_pending, + .write_pending_status = ibmvscsis_write_pending_status, + .set_default_node_attributes = ibmvscsis_set_default_node_attrs, + .get_cmd_state = ibmvscsis_get_cmd_state, + .queue_data_in = ibmvscsis_queue_data_in, + .queue_status = ibmvscsis_queue_status, + .queue_tm_rsp = ibmvscsis_queue_tm_rsp, + .aborted_task = ibmvscsis_aborted_task, + /* + * Setup function pointers for logic in target_core_fabric_configfs.c + */ + .fabric_make_wwn = ibmvscsis_make_tport, + .fabric_drop_wwn = ibmvscsis_drop_tport, + .fabric_make_tpg = ibmvscsis_make_tpg, + .fabric_drop_tpg = ibmvscsis_drop_tpg, + + .tfc_wwn_attrs = ibmvscsis_wwn_attrs, + .tfc_tpg_base_attrs = ibmvscsis_tpg_attrs, +}; + +static void ibmvscsis_dev_release(struct device *dev) {}; + +static struct class_attribute ibmvscsis_class_attrs[] = { + __ATTR_NULL, +}; + +static struct device_attribute dev_attr_system_id = + __ATTR(system_id, S_IRUGO, system_id_show, NULL); + +static struct device_attribute dev_attr_partition_number = + __ATTR(partition_number, S_IRUGO, partition_number_show, NULL); + +static struct device_attribute dev_attr_unit_address = + __ATTR(unit_address, S_IRUGO, unit_address_show, NULL); + +/* per-device sysfs attributes; the list must end with a NULL entry */ +static struct attribute *ibmvscsis_dev_attrs[] = { + &dev_attr_system_id.attr, + &dev_attr_partition_number.attr, + &dev_attr_unit_address.attr, + NULL, +}; +ATTRIBUTE_GROUPS(ibmvscsis_dev); + +static struct class ibmvscsis_class = { + .name = "ibmvscsis", + .dev_release = ibmvscsis_dev_release, + 
.class_attrs = ibmvscsis_class_attrs, + .dev_groups = ibmvscsis_dev_groups, +}; + +static struct vio_device_id ibmvscsis_device_table[] = { + { "v-scsi-host", "IBM,v-scsi-host" }, + { "", "" } +}; +MODULE_DEVICE_TABLE(vio, ibmvscsis_device_table); + +static struct vio_driver ibmvscsis_driver = { + .name = "ibmvscsis", + .id_table = ibmvscsis_device_table, + .probe = ibmvscsis_probe, + .remove = ibmvscsis_remove, +}; + +/* + * ibmvscsis_init() - Kernel Module initialization + * + * Note: vio_register_driver() registers callback functions, and at least one + * of those callback functions calls TCM - Linux IO Target Subsystem, thus + * the SCSI Target template must be registered before vio_register_driver() + * is called. + */ +static int __init ibmvscsis_init(void) +{ + int rc = 0; + + rc = ibmvscsis_get_system_info(); + if (rc) { + pr_err("rc %d from get_system_info\n", rc); + goto out; + } + + rc = class_register(&ibmvscsis_class); + if (rc) { + pr_err("failed class register\n"); + goto out; + } + + rc = target_register_template(&ibmvscsis_ops); + if (rc) { + pr_err("rc %d from target_register_template\n", rc); + goto unregister_class; + } + + rc = vio_register_driver(&ibmvscsis_driver); + if (rc) { + pr_err("rc %d from vio_register_driver\n", rc); + goto unregister_target; + } + + return 0; + +unregister_target: + target_unregister_template(&ibmvscsis_ops); +unregister_class: + class_unregister(&ibmvscsis_class); +out: + return rc; +} + +static void __exit ibmvscsis_exit(void) +{ + pr_info("Unregister IBM virtual SCSI host driver\n"); + vio_unregister_driver(&ibmvscsis_driver); + target_unregister_template(&ibmvscsis_ops); + class_unregister(&ibmvscsis_class); +} + +MODULE_DESCRIPTION("IBMVSCSIS fabric driver"); +MODULE_AUTHOR("Bryant G. 
Ly and Michael Cyr"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(IBMVSCSIS_VERSION); +module_init(ibmvscsis_init); +module_exit(ibmvscsis_exit); diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.h b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.h new file mode 100644 index 000000000000..981a0c992b6c --- /dev/null +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.h @@ -0,0 +1,346 @@ +/******************************************************************************* + * IBM Virtual SCSI Target Driver + * Copyright (C) 2003-2005 Dave Boutcher (boutcher@us.ibm.com) IBM Corp. + * Santiago Leon (santil@us.ibm.com) IBM Corp. + * Linda Xie (lxie@us.ibm.com) IBM Corp. + * + * Copyright (C) 2005-2011 FUJITA Tomonori <tomof@acm.org> + * Copyright (C) 2010 Nicholas A. Bellinger <nab@kernel.org> + * Copyright (C) 2016 Bryant G. Ly <bryantly@linux.vnet.ibm.com> IBM Corp. + * + * Authors: Bryant G. Ly <bryantly@linux.vnet.ibm.com> + * Authors: Michael Cyr <mikecyr@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + ****************************************************************************/ + +#ifndef __H_IBMVSCSI_TGT +#define __H_IBMVSCSI_TGT + +#include "libsrp.h" + +#define SYS_ID_NAME_LEN 64 +#define PARTITION_NAMELEN 96 +#define IBMVSCSIS_NAMELEN 32 + +#define MSG_HI 0 +#define MSG_LOW 1 + +#define MAX_CMD_Q_PAGES 4 +#define CRQ_PER_PAGE (PAGE_SIZE / sizeof(struct viosrp_crq)) +/* in terms of number of elements */ +#define DEFAULT_CMD_Q_SIZE CRQ_PER_PAGE +#define MAX_CMD_Q_SIZE (DEFAULT_CMD_Q_SIZE * MAX_CMD_Q_PAGES) + +#define SRP_VIOLATION 0x102 /* general error code */ + +/* + * SRP buffer formats defined as of 16.a supported by this driver. + */ +#define SUPPORTED_FORMATS ((SRP_DATA_DESC_DIRECT << 1) | \ + (SRP_DATA_DESC_INDIRECT << 1)) + +#define SCSI_LUN_ADDR_METHOD_FLAT 1 + +struct dma_window { + u32 liobn; /* Unique per vdevice */ + u64 tce_base; /* Physical location of the TCE table */ + u64 tce_size; /* Size of the TCE table in bytes */ +}; + +struct target_dds { + u64 unit_id; /* 64 bit will force alignment */ +#define NUM_DMA_WINDOWS 2 +#define LOCAL 0 +#define REMOTE 1 + struct dma_window window[NUM_DMA_WINDOWS]; + + /* root node property "ibm,partition-no" */ + uint partition_num; + char partition_name[PARTITION_NAMELEN]; +}; + +#define MAX_NUM_PORTS 1 +#define MAX_H_COPY_RDMA (128 * 1024) + +#define MAX_EYE 64 + +/* Return codes */ +#define ADAPT_SUCCESS 0L +/* choose error codes that do not conflict with PHYP */ +#define ERROR -40L + +struct format_code { + u8 reserved; + u8 buffers; +}; + +struct client_info { +#define SRP_VERSION "16.a" + char srp_version[8]; + /* root node property ibm,partition-name */ + char partition_name[PARTITION_NAMELEN]; + /* root node property ibm,partition-no */ + u32 partition_number; + /* initially 1 */ + u32 mad_version; + u32 os_type; +}; + +/* + * Changing this constant changes the number of seconds to wait before + * considering the client will never service its queue again. 
+ */ +#define SECONDS_TO_CONSIDER_FAILED 30 +/* + * These constants set the polling period used to determine if the client + * has freed at least one element in the response queue. + */ +#define WAIT_SECONDS 1 +#define WAIT_NANO_SECONDS 5000 +#define MAX_TIMER_POPS ((1000000 / WAIT_NANO_SECONDS) * \ + SECONDS_TO_CONSIDER_FAILED) +/* + * general purpose timer control block + * which can be used for multiple functions + */ +struct timer_cb { + struct hrtimer timer; + /* + * how long has it been since the client + * serviced the queue. The variable is incrmented + * in the service_wait_q routine and cleared + * in send messages + */ + int timer_pops; + /* the timer is started */ + bool started; +}; + +struct cmd_queue { + /* kva */ + struct viosrp_crq *base_addr; + dma_addr_t crq_token; + /* used to maintain index */ + uint mask; + /* current element */ + uint index; + int size; +}; + +#define SCSOLNT_RESP_SHIFT 1 +#define UCSOLNT_RESP_SHIFT 2 + +#define SCSOLNT BIT(SCSOLNT_RESP_SHIFT) +#define UCSOLNT BIT(UCSOLNT_RESP_SHIFT) + +enum cmd_type { + SCSI_CDB = 0x01, + TASK_MANAGEMENT = 0x02, + /* MAD or addressed to port 0 */ + ADAPTER_MAD = 0x04, + UNSET_TYPE = 0x08, +}; + +struct iu_rsp { + u8 format; + u8 sol_not; + u16 len; + /* tag is just to help client identify cmd, so don't translate be/le */ + u64 tag; +}; + +struct ibmvscsis_cmd { + struct list_head list; + /* Used for TCM Core operations */ + struct se_cmd se_cmd; + struct iu_entry *iue; + struct iu_rsp rsp; + struct work_struct work; + struct scsi_info *adapter; + /* Sense buffer that will be mapped into outgoing status */ + unsigned char sense_buf[TRANSPORT_SENSE_BUFFER]; + u64 init_time; +#define CMD_FAST_FAIL BIT(0) + u32 flags; + char type; +}; + +struct ibmvscsis_nexus { + struct se_session *se_sess; +}; + +struct ibmvscsis_tport { + /* SCSI protocol the tport is providing */ + u8 tport_proto_id; + /* ASCII formatted WWPN for SRP Target port */ + char tport_name[IBMVSCSIS_NAMELEN]; + /* Returned by 
ibmvscsis_make_tport() */ + struct se_wwn tport_wwn; + /* Returned by ibmvscsis_make_tpg() */ + struct se_portal_group se_tpg; + /* ibmvscsis port target portal group tag for TCM */ + u16 tport_tpgt; + /* Pointer to TCM session for I_T Nexus */ + struct ibmvscsis_nexus *ibmv_nexus; + bool enabled; + bool releasing; +}; + +struct scsi_info { + struct list_head list; + char eye[MAX_EYE]; + + /* commands waiting for space on repsonse queue */ + struct list_head waiting_rsp; +#define NO_QUEUE 0x00 +#define WAIT_ENABLED 0X01 + /* driver has received an initialize command */ +#define PART_UP_WAIT_ENAB 0x02 +#define WAIT_CONNECTION 0x04 + /* have established a connection */ +#define CONNECTED 0x08 + /* at least one port is processing SRP IU */ +#define SRP_PROCESSING 0x10 + /* remove request received */ +#define UNCONFIGURING 0x20 + /* disconnect by letting adapter go idle, no error */ +#define WAIT_IDLE 0x40 + /* disconnecting to clear an error */ +#define ERR_DISCONNECT 0x80 + /* disconnect to clear error state, then come back up */ +#define ERR_DISCONNECT_RECONNECT 0x100 + /* disconnected after clearing an error */ +#define ERR_DISCONNECTED 0x200 + /* A series of errors caused unexpected errors */ +#define UNDEFINED 0x400 + u16 state; + int fast_fail; + struct target_dds dds; + char *cmd_pool; + /* list of free commands */ + struct list_head free_cmd; + /* command elements ready for scheduler */ + struct list_head schedule_q; + /* commands sent to TCM */ + struct list_head active_q; + caddr_t *map_buf; + /* ioba of map buffer */ + dma_addr_t map_ioba; + /* allowable number of outstanding SRP requests */ + int request_limit; + /* extra credit */ + int credit; + /* outstanding transactions against credit limit */ + int debit; + + /* allow only one outstanding mad request */ +#define PROCESSING_MAD 0x00002 + /* Waiting to go idle */ +#define WAIT_FOR_IDLE 0x00004 + /* H_REG_CRQ called */ +#define CRQ_CLOSED 0x00010 + /* detected that client has failed */ +#define 
CLIENT_FAILED 0x00040 + /* detected that transport event occurred */ +#define TRANS_EVENT 0x00080 + /* don't attempt to send anything to the client */ +#define RESPONSE_Q_DOWN 0x00100 + /* request made to schedule disconnect handler */ +#define SCHEDULE_DISCONNECT 0x00400 + /* disconnect handler is scheduled */ +#define DISCONNECT_SCHEDULED 0x00800 + u32 flags; + /* adapter lock */ + spinlock_t intr_lock; + /* information needed to manage command queue */ + struct cmd_queue cmd_q; + /* used in hcall to copy response back into srp buffer */ + u64 empty_iu_id; + /* used in crq, to tag what iu the response is for */ + u64 empty_iu_tag; + uint new_state; + /* control block for the response queue timer */ + struct timer_cb rsp_q_timer; + /* keep last client to enable proper accounting */ + struct client_info client_data; + /* what can this client do */ + u32 client_cap; + /* + * The following two fields capture state and flag changes that + * can occur when the lock is given up. In the orginal design, + * the lock was held during calls into phyp; + * however, phyp did not meet PAPR architecture. This is + * a work around. + */ + u16 phyp_acr_state; + u32 phyp_acr_flags; + + struct workqueue_struct *work_q; + struct completion wait_idle; + struct device dev; + struct vio_dev *dma_dev; + struct srp_target target; + struct ibmvscsis_tport tport; + struct tasklet_struct work_task; + struct work_struct proc_work; +}; + +/* + * Provide a constant that allows software to detect the adapter is + * disconnecting from the client from one of several states. 
+ */ +#define IS_DISCONNECTING (UNCONFIGURING | ERR_DISCONNECT_RECONNECT | \ + ERR_DISCONNECT) + +/* + * Provide a constant that can be used with interrupt handling that + * essentially lets the interrupt handler know that all requests should + * be thrown out, + */ +#define DONT_PROCESS_STATE (IS_DISCONNECTING | UNDEFINED | \ + ERR_DISCONNECTED | WAIT_IDLE) + +/* + * If any of these flag bits are set then do not allow the interrupt + * handler to schedule the off level handler. + */ +#define BLOCK (DISCONNECT_SCHEDULED) + +/* State and transition events that stop the interrupt handler */ +#define TARGET_STOP(VSCSI) (long)(((VSCSI)->state & DONT_PROCESS_STATE) | \ + ((VSCSI)->flags & BLOCK)) + +/* flag bit that are not reset during disconnect */ +#define PRESERVE_FLAG_FIELDS 0 + +#define vio_iu(IUE) ((union viosrp_iu *)((IUE)->sbuf->buf)) + +#define READ_CMD(cdb) (((cdb)[0] & 0x1F) == 8) +#define WRITE_CMD(cdb) (((cdb)[0] & 0x1F) == 0xA) + +#ifndef H_GET_PARTNER_INFO +#define H_GET_PARTNER_INFO 0x0000000000000008LL +#endif + +#define h_copy_rdma(l, sa, sb, da, db) \ + plpar_hcall_norets(H_COPY_RDMA, l, sa, sb, da, db) +#define h_vioctl(u, o, a, u1, u2, u3, u4) \ + plpar_hcall_norets(H_VIOCTL, u, o, a, u1, u2) +#define h_reg_crq(ua, tok, sz) \ + plpar_hcall_norets(H_REG_CRQ, ua, tok, sz) +#define h_free_crq(ua) \ + plpar_hcall_norets(H_FREE_CRQ, ua) +#define h_send_crq(ua, d1, d2) \ + plpar_hcall_norets(H_SEND_CRQ, ua, d1, d2) + +#endif diff --git a/drivers/scsi/ibmvscsi_tgt/libsrp.c b/drivers/scsi/ibmvscsi_tgt/libsrp.c new file mode 100644 index 000000000000..5a4cc28ca5ff --- /dev/null +++ b/drivers/scsi/ibmvscsi_tgt/libsrp.c @@ -0,0 +1,427 @@ +/******************************************************************************* + * SCSI RDMA Protocol lib functions + * + * Copyright (C) 2006 FUJITA Tomonori <tomof@acm.org> + * Copyright (C) 2016 Bryant G. Ly <bryantly@linux.vnet.ibm.com> IBM Corp. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + ***********************************************************************/ + +#define pr_fmt(fmt) "libsrp: " fmt + +#include <linux/printk.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/kfifo.h> +#include <linux/scatterlist.h> +#include <linux/dma-mapping.h> +#include <linux/module.h> +#include <scsi/srp.h> +#include <target/target_core_base.h> +#include "libsrp.h" +#include "ibmvscsi_tgt.h" + +static int srp_iu_pool_alloc(struct srp_queue *q, size_t max, + struct srp_buf **ring) +{ + struct iu_entry *iue; + int i; + + q->pool = kcalloc(max, sizeof(struct iu_entry *), GFP_KERNEL); + if (!q->pool) + return -ENOMEM; + q->items = kcalloc(max, sizeof(struct iu_entry), GFP_KERNEL); + if (!q->items) + goto free_pool; + + spin_lock_init(&q->lock); + kfifo_init(&q->queue, (void *)q->pool, max * sizeof(void *)); + + for (i = 0, iue = q->items; i < max; i++) { + kfifo_in(&q->queue, (void *)&iue, sizeof(void *)); + iue->sbuf = ring[i]; + iue++; + } + return 0; + +free_pool: + kfree(q->pool); + return -ENOMEM; +} + +static void srp_iu_pool_free(struct srp_queue *q) +{ + kfree(q->items); + kfree(q->pool); +} + +static struct srp_buf **srp_ring_alloc(struct device *dev, + size_t max, size_t size) +{ + struct srp_buf **ring; + int i; + + ring = kcalloc(max, sizeof(struct srp_buf *), GFP_KERNEL); + if (!ring) + return NULL; + + for (i = 0; i < max; i++) { + ring[i] = kzalloc(sizeof(*ring[i]), GFP_KERNEL); + if (!ring[i]) + goto out; + ring[i]->buf = 
dma_alloc_coherent(dev, size, &ring[i]->dma, + GFP_KERNEL); + if (!ring[i]->buf) + goto out; + } + return ring; + +out: + for (i = 0; i < max && ring[i]; i++) { + if (ring[i]->buf) { + dma_free_coherent(dev, size, ring[i]->buf, + ring[i]->dma); + } + kfree(ring[i]); + } + kfree(ring); + + return NULL; +} + +static void srp_ring_free(struct device *dev, struct srp_buf **ring, + size_t max, size_t size) +{ + int i; + + for (i = 0; i < max; i++) { + dma_free_coherent(dev, size, ring[i]->buf, ring[i]->dma); + kfree(ring[i]); + } + kfree(ring); +} + +int srp_target_alloc(struct srp_target *target, struct device *dev, + size_t nr, size_t iu_size) +{ + int err; + + spin_lock_init(&target->lock); + + target->dev = dev; + + target->srp_iu_size = iu_size; + target->rx_ring_size = nr; + target->rx_ring = srp_ring_alloc(target->dev, nr, iu_size); + if (!target->rx_ring) + return -ENOMEM; + err = srp_iu_pool_alloc(&target->iu_queue, nr, target->rx_ring); + if (err) + goto free_ring; + + dev_set_drvdata(target->dev, target); + return 0; + +free_ring: + srp_ring_free(target->dev, target->rx_ring, nr, iu_size); + return -ENOMEM; +} + +void srp_target_free(struct srp_target *target) +{ + dev_set_drvdata(target->dev, NULL); + srp_ring_free(target->dev, target->rx_ring, target->rx_ring_size, + target->srp_iu_size); + srp_iu_pool_free(&target->iu_queue); +} + +struct iu_entry *srp_iu_get(struct srp_target *target) +{ + struct iu_entry *iue = NULL; + + if (kfifo_out_locked(&target->iu_queue.queue, (void *)&iue, + sizeof(void *), + &target->iu_queue.lock) != sizeof(void *)) { + WARN_ONCE(1, "unexpected fifo state"); + return NULL; + } + if (!iue) + return iue; + iue->target = target; + iue->flags = 0; + return iue; +} + +void srp_iu_put(struct iu_entry *iue) +{ + kfifo_in_locked(&iue->target->iu_queue.queue, (void *)&iue, + sizeof(void *), &iue->target->iu_queue.lock); +} + +static int srp_direct_data(struct ibmvscsis_cmd *cmd, struct srp_direct_buf *md, + enum dma_data_direction dir, 
srp_rdma_t rdma_io, + int dma_map, int ext_desc) +{ + struct iu_entry *iue = NULL; + struct scatterlist *sg = NULL; + int err, nsg = 0, len; + + if (dma_map) { + iue = cmd->iue; + sg = cmd->se_cmd.t_data_sg; + nsg = dma_map_sg(iue->target->dev, sg, cmd->se_cmd.t_data_nents, + DMA_BIDIRECTIONAL); + if (!nsg) { + pr_err("fail to map %p %d\n", iue, + cmd->se_cmd.t_data_nents); + return 0; + } + len = min(cmd->se_cmd.data_length, be32_to_cpu(md->len)); + } else { + len = be32_to_cpu(md->len); + } + + err = rdma_io(cmd, sg, nsg, md, 1, dir, len); + + if (dma_map) + dma_unmap_sg(iue->target->dev, sg, nsg, DMA_BIDIRECTIONAL); + + return err; +} + +static int srp_indirect_data(struct ibmvscsis_cmd *cmd, struct srp_cmd *srp_cmd, + struct srp_indirect_buf *id, + enum dma_data_direction dir, srp_rdma_t rdma_io, + int dma_map, int ext_desc) +{ + struct iu_entry *iue = NULL; + struct srp_direct_buf *md = NULL; + struct scatterlist dummy, *sg = NULL; + dma_addr_t token = 0; + int err = 0; + int nmd, nsg = 0, len; + + if (dma_map || ext_desc) { + iue = cmd->iue; + sg = cmd->se_cmd.t_data_sg; + } + + nmd = be32_to_cpu(id->table_desc.len) / sizeof(struct srp_direct_buf); + + if ((dir == DMA_FROM_DEVICE && nmd == srp_cmd->data_in_desc_cnt) || + (dir == DMA_TO_DEVICE && nmd == srp_cmd->data_out_desc_cnt)) { + md = &id->desc_list[0]; + goto rdma; + } + + if (ext_desc && dma_map) { + md = dma_alloc_coherent(iue->target->dev, + be32_to_cpu(id->table_desc.len), + &token, GFP_KERNEL); + if (!md) { + pr_err("Can't get dma memory %u\n", + be32_to_cpu(id->table_desc.len)); + return -ENOMEM; + } + + sg_init_one(&dummy, md, be32_to_cpu(id->table_desc.len)); + sg_dma_address(&dummy) = token; + sg_dma_len(&dummy) = be32_to_cpu(id->table_desc.len); + err = rdma_io(cmd, &dummy, 1, &id->table_desc, 1, DMA_TO_DEVICE, + be32_to_cpu(id->table_desc.len)); + if (err) { + pr_err("Error copying indirect table %d\n", err); + goto free_mem; + } + } else { + pr_err("This command uses external indirect 
buffer\n"); + return -EINVAL; + } + +rdma: + if (dma_map) { + nsg = dma_map_sg(iue->target->dev, sg, cmd->se_cmd.t_data_nents, + DMA_BIDIRECTIONAL); + if (!nsg) { + pr_err("fail to map %p %d\n", iue, + cmd->se_cmd.t_data_nents); + err = -EIO; + goto free_mem; + } + len = min(cmd->se_cmd.data_length, be32_to_cpu(id->len)); + } else { + len = be32_to_cpu(id->len); + } + + err = rdma_io(cmd, sg, nsg, md, nmd, dir, len); + + if (dma_map) + dma_unmap_sg(iue->target->dev, sg, nsg, DMA_BIDIRECTIONAL); + +free_mem: + if (token && dma_map) { + dma_free_coherent(iue->target->dev, + be32_to_cpu(id->table_desc.len), md, token); + } + return err; +} + +static int data_out_desc_size(struct srp_cmd *cmd) +{ + int size = 0; + u8 fmt = cmd->buf_fmt >> 4; + + switch (fmt) { + case SRP_NO_DATA_DESC: + break; + case SRP_DATA_DESC_DIRECT: + size = sizeof(struct srp_direct_buf); + break; + case SRP_DATA_DESC_INDIRECT: + size = sizeof(struct srp_indirect_buf) + + sizeof(struct srp_direct_buf) * cmd->data_out_desc_cnt; + break; + default: + pr_err("client error. Invalid data_out_format %x\n", fmt); + break; + } + return size; +} + +/* + * TODO: this can be called multiple times for a single command if it + * has very long data. 
+ */ +int srp_transfer_data(struct ibmvscsis_cmd *cmd, struct srp_cmd *srp_cmd, + srp_rdma_t rdma_io, int dma_map, int ext_desc) +{ + struct srp_direct_buf *md; + struct srp_indirect_buf *id; + enum dma_data_direction dir; + int offset, err = 0; + u8 format; + + if (!cmd->se_cmd.t_data_nents) + return 0; + + offset = srp_cmd->add_cdb_len & ~3; + + dir = srp_cmd_direction(srp_cmd); + if (dir == DMA_FROM_DEVICE) + offset += data_out_desc_size(srp_cmd); + + if (dir == DMA_TO_DEVICE) + format = srp_cmd->buf_fmt >> 4; + else + format = srp_cmd->buf_fmt & ((1U << 4) - 1); + + switch (format) { + case SRP_NO_DATA_DESC: + break; + case SRP_DATA_DESC_DIRECT: + md = (struct srp_direct_buf *)(srp_cmd->add_data + offset); + err = srp_direct_data(cmd, md, dir, rdma_io, dma_map, ext_desc); + break; + case SRP_DATA_DESC_INDIRECT: + id = (struct srp_indirect_buf *)(srp_cmd->add_data + offset); + err = srp_indirect_data(cmd, srp_cmd, id, dir, rdma_io, dma_map, + ext_desc); + break; + default: + pr_err("Unknown format %d %x\n", dir, format); + err = -EINVAL; + } + + return err; +} + +u64 srp_data_length(struct srp_cmd *cmd, enum dma_data_direction dir) +{ + struct srp_direct_buf *md; + struct srp_indirect_buf *id; + u64 len = 0; + uint offset = cmd->add_cdb_len & ~3; + u8 fmt; + + if (dir == DMA_TO_DEVICE) { + fmt = cmd->buf_fmt >> 4; + } else { + fmt = cmd->buf_fmt & ((1U << 4) - 1); + offset += data_out_desc_size(cmd); + } + + switch (fmt) { + case SRP_NO_DATA_DESC: + break; + case SRP_DATA_DESC_DIRECT: + md = (struct srp_direct_buf *)(cmd->add_data + offset); + len = be32_to_cpu(md->len); + break; + case SRP_DATA_DESC_INDIRECT: + id = (struct srp_indirect_buf *)(cmd->add_data + offset); + len = be32_to_cpu(id->len); + break; + default: + pr_err("invalid data format %x\n", fmt); + break; + } + return len; +} + +int srp_get_desc_table(struct srp_cmd *srp_cmd, enum dma_data_direction *dir, + u64 *data_len) +{ + struct srp_indirect_buf *idb; + struct srp_direct_buf *db; + uint 
add_cdb_offset; + int rc; + + /* + * The pointer computations below will only be compiled correctly + * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check + * whether srp_cmd::add_data has been declared as a byte pointer. + */ + BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) + && !__same_type(srp_cmd->add_data[0], (u8)0)); + + BUG_ON(!dir); + BUG_ON(!data_len); + + rc = 0; + *data_len = 0; + + *dir = DMA_NONE; + + if (srp_cmd->buf_fmt & 0xf) + *dir = DMA_FROM_DEVICE; + else if (srp_cmd->buf_fmt >> 4) + *dir = DMA_TO_DEVICE; + + add_cdb_offset = srp_cmd->add_cdb_len & ~3; + if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) || + ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) { + db = (struct srp_direct_buf *)(srp_cmd->add_data + + add_cdb_offset); + *data_len = be32_to_cpu(db->len); + } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) || + ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) { + idb = (struct srp_indirect_buf *)(srp_cmd->add_data + + add_cdb_offset); + + *data_len = be32_to_cpu(idb->len); + } + return rc; +} + +MODULE_DESCRIPTION("SCSI RDMA Protocol lib functions"); +MODULE_AUTHOR("FUJITA Tomonori"); +MODULE_LICENSE("GPL"); diff --git a/drivers/scsi/ibmvscsi_tgt/libsrp.h b/drivers/scsi/ibmvscsi_tgt/libsrp.h new file mode 100644 index 000000000000..4696f331453e --- /dev/null +++ b/drivers/scsi/ibmvscsi_tgt/libsrp.h @@ -0,0 +1,123 @@ +#ifndef __LIBSRP_H__ +#define __LIBSRP_H__ + +#include <linux/list.h> +#include <linux/kfifo.h> +#include <scsi/srp.h> + +enum srp_valid { + INVALIDATE_CMD_RESP_EL = 0, + VALID_CMD_RESP_EL = 0x80, + VALID_INIT_MSG = 0xC0, + VALID_TRANS_EVENT = 0xFF +}; + +enum srp_format { + SRP_FORMAT = 1, + MAD_FORMAT = 2, + OS400_FORMAT = 3, + AIX_FORMAT = 4, + LINUX_FORMAT = 5, + MESSAGE_IN_CRQ = 6 +}; + +enum srp_init_msg { + INIT_MSG = 1, + INIT_COMPLETE_MSG = 2 +}; + +enum srp_trans_event { + UNUSED_FORMAT = 0, + PARTNER_FAILED = 1, + PARTNER_DEREGISTER = 2, + MIGRATED = 6 +}; + +enum 
srp_status { + HEADER_DESCRIPTOR = 0xF1, + PING = 0xF5, + PING_RESPONSE = 0xF6 +}; + +enum srp_mad_version { + MAD_VERSION_1 = 1 +}; + +enum srp_os_type { + OS400 = 1, + LINUX = 2, + AIX = 3, + OFW = 4 +}; + +enum srp_task_attributes { + SRP_SIMPLE_TASK = 0, + SRP_HEAD_TASK = 1, + SRP_ORDERED_TASK = 2, + SRP_ACA_TASK = 4 +}; + +enum { + SRP_TASK_MANAGEMENT_FUNCTION_COMPLETE = 0, + SRP_REQUEST_FIELDS_INVALID = 2, + SRP_TASK_MANAGEMENT_FUNCTION_NOT_SUPPORTED = 4, + SRP_TASK_MANAGEMENT_FUNCTION_FAILED = 5 +}; + +struct srp_buf { + dma_addr_t dma; + void *buf; +}; + +struct srp_queue { + void *pool; + void *items; + struct kfifo queue; + spinlock_t lock; +}; + +struct srp_target { + struct device *dev; + + spinlock_t lock; + struct list_head cmd_queue; + + size_t srp_iu_size; + struct srp_queue iu_queue; + size_t rx_ring_size; + struct srp_buf **rx_ring; + + void *ldata; +}; + +struct iu_entry { + struct srp_target *target; + + struct list_head ilist; + dma_addr_t remote_token; + unsigned long flags; + + struct srp_buf *sbuf; + u16 iu_len; +}; + +struct ibmvscsis_cmd; + +typedef int (srp_rdma_t)(struct ibmvscsis_cmd *, struct scatterlist *, int, + struct srp_direct_buf *, int, + enum dma_data_direction, unsigned int); +int srp_target_alloc(struct srp_target *, struct device *, size_t, size_t); +void srp_target_free(struct srp_target *); +struct iu_entry *srp_iu_get(struct srp_target *); +void srp_iu_put(struct iu_entry *); +int srp_transfer_data(struct ibmvscsis_cmd *, struct srp_cmd *, + srp_rdma_t, int, int); +u64 srp_data_length(struct srp_cmd *cmd, enum dma_data_direction dir); +int srp_get_desc_table(struct srp_cmd *srp_cmd, enum dma_data_direction *dir, + u64 *data_len); +static inline int srp_cmd_direction(struct srp_cmd *cmd) +{ + return (cmd->buf_fmt >> 4) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE; +} + +#endif diff --git a/drivers/staging/emxx_udc/Kconfig b/drivers/staging/emxx_udc/Kconfig index cc3402020487..d7577096fb25 100644 --- a/drivers/staging/emxx_udc/Kconfig +++ b/drivers/staging/emxx_udc/Kconfig @@ -1,5 +1,5 @@ config USB_EMXX - bool "EMXX USB Function Device Controller" + tristate "EMXX USB Function Device Controller" depends on USB_GADGET && (ARCH_SHMOBILE || (ARM && COMPILE_TEST)) help The Emma Mobile series of SoCs from Renesas Electronics and diff --git a/drivers/staging/emxx_udc/emxx_udc.c b/drivers/staging/emxx_udc/emxx_udc.c index 3bd91758b2da..3b56b2826263 100644 --- a/drivers/staging/emxx_udc/emxx_udc.c +++ b/drivers/staging/emxx_udc/emxx_udc.c @@ -15,7 +15,7 @@ */ #include <linux/kernel.h> -#include <linux/init.h> +#include <linux/module.h> #include <linux/platform_device.h> #include <linux/delay.h> #include <linux/ioport.h> @@ -39,9 +39,11 @@ #include "emxx_udc.h" +#define DRIVER_DESC "EMXX UDC driver" #define DMA_ADDR_INVALID (~(dma_addr_t)0) static const char driver_name[] = "emxx_udc"; +static const char driver_desc[] = DRIVER_DESC; /*===========================================================================*/ /* Prototype */ @@ -3296,6 +3298,28 @@ static void nbu2ss_drv_shutdown(struct platform_device *pdev) } /*-------------------------------------------------------------------------*/ +static int nbu2ss_drv_remove(struct platform_device *pdev) +{ + struct nbu2ss_udc *udc; + struct nbu2ss_ep *ep; + int i; + + udc = &udc_controller; + + for (i = 0; i < NUM_ENDPOINTS; i++) { + ep = &udc->ep[i]; + if (ep->virt_buf) + dma_free_coherent(NULL, PAGE_SIZE, + (void *)ep->virt_buf, ep->phys_buf); + } + + /* Interrupt Handler - Release */ + free_irq(INT_VBUS, udc); + + return 0; +} + +/*-------------------------------------------------------------------------*/ static int nbu2ss_drv_suspend(struct platform_device *pdev, pm_message_t state) { struct nbu2ss_udc *udc; @@ -3347,12 +3371,16 @@ static int 
nbu2ss_drv_resume(struct platform_device *pdev) static struct platform_driver udc_driver = { .probe = nbu2ss_drv_probe, .shutdown = nbu2ss_drv_shutdown, + .remove = nbu2ss_drv_remove, .suspend = nbu2ss_drv_suspend, .resume = nbu2ss_drv_resume, .driver = { - .name = driver_name, - .suppress_bind_attrs = true, + .name = driver_name, }, }; -builtin_platform_driver(udc_driver); +module_platform_driver(udc_driver); + +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_AUTHOR("Renesas Electronics Corporation"); +MODULE_LICENSE("GPL"); diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 50f3d3a0dd7b..39b928c2849d 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -492,7 +492,8 @@ void iscsit_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) bool scsi_cmd = (cmd->iscsi_opcode == ISCSI_OP_SCSI_CMD); spin_lock_bh(&conn->cmd_lock); - if (!list_empty(&cmd->i_conn_node)) + if (!list_empty(&cmd->i_conn_node) && + !(cmd->se_cmd.transport_state & CMD_T_FABRIC_STOP)) list_del_init(&cmd->i_conn_node); spin_unlock_bh(&conn->cmd_lock); @@ -4034,6 +4035,7 @@ int iscsi_target_rx_thread(void *arg) static void iscsit_release_commands_from_conn(struct iscsi_conn *conn) { + LIST_HEAD(tmp_list); struct iscsi_cmd *cmd = NULL, *cmd_tmp = NULL; struct iscsi_session *sess = conn->sess; /* @@ -4042,18 +4044,26 @@ static void iscsit_release_commands_from_conn(struct iscsi_conn *conn) * has been reset -> returned sleeping pre-handler state. 
*/ spin_lock_bh(&conn->cmd_lock); - list_for_each_entry_safe(cmd, cmd_tmp, &conn->conn_cmd_list, i_conn_node) { + list_splice_init(&conn->conn_cmd_list, &tmp_list); + list_for_each_entry(cmd, &tmp_list, i_conn_node) { + struct se_cmd *se_cmd = &cmd->se_cmd; + + if (se_cmd->se_tfo != NULL) { + spin_lock(&se_cmd->t_state_lock); + se_cmd->transport_state |= CMD_T_FABRIC_STOP; + spin_unlock(&se_cmd->t_state_lock); + } + } + spin_unlock_bh(&conn->cmd_lock); + + list_for_each_entry_safe(cmd, cmd_tmp, &tmp_list, i_conn_node) { list_del_init(&cmd->i_conn_node); - spin_unlock_bh(&conn->cmd_lock); iscsit_increment_maxcmdsn(cmd, sess); - iscsit_free_cmd(cmd, true); - spin_lock_bh(&conn->cmd_lock); } - spin_unlock_bh(&conn->cmd_lock); } static void iscsit_stop_timers_for_cmds( diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index b5212f0f9571..adf419fa4291 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -1371,8 +1371,9 @@ static int __iscsi_target_login_thread(struct iscsi_np *np) } login->zero_tsih = zero_tsih; - conn->sess->se_sess->sup_prot_ops = - conn->conn_transport->iscsit_get_sup_prot_ops(conn); + if (conn->sess) + conn->sess->se_sess->sup_prot_ops = + conn->conn_transport->iscsit_get_sup_prot_ops(conn); tpg = conn->tpg; if (!tpg) { diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index a4046ca6e60d..6b423485c5d6 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -821,13 +821,15 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name) * in ATA and we need to set TPE=1 */ bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib, - struct request_queue *q, int block_size) + struct request_queue *q) { + int block_size = queue_logical_block_size(q); + if (!blk_queue_discard(q)) return false; - attrib->max_unmap_lba_count = (q->limits.max_discard_sectors 
<< 9) / - block_size; + attrib->max_unmap_lba_count = + q->limits.max_discard_sectors >> (ilog2(block_size) - 9); /* * Currently hardcoded to 1 in Linux/SCSI code.. */ diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 75f0f08b2a34..d545993df18b 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -161,8 +161,7 @@ static int fd_configure_device(struct se_device *dev) dev_size, div_u64(dev_size, fd_dev->fd_block_size), fd_dev->fd_block_size); - if (target_configure_unmap_from_queue(&dev->dev_attrib, q, - fd_dev->fd_block_size)) + if (target_configure_unmap_from_queue(&dev->dev_attrib, q)) pr_debug("IFILE: BLOCK Discard support available," " disabled by default\n"); /* @@ -523,7 +522,7 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, */ if (cmd->data_length > FD_MAX_BYTES) { pr_err("FILEIO: Not able to process I/O of %u bytes due to" - "FD_MAX_BYTES: %u iovec count limitiation\n", + "FD_MAX_BYTES: %u iovec count limitation\n", cmd->data_length, FD_MAX_BYTES); return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 22af12f8b8eb..47cf6c977367 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -121,8 +121,7 @@ static int iblock_configure_device(struct se_device *dev) dev->dev_attrib.hw_max_sectors = queue_max_hw_sectors(q); dev->dev_attrib.hw_queue_depth = q->nr_requests; - if (target_configure_unmap_from_queue(&dev->dev_attrib, q, - dev->dev_attrib.hw_block_size)) + if (target_configure_unmap_from_queue(&dev->dev_attrib, q)) pr_debug("IBLOCK: BLOCK Discard support available," " disabled by default\n"); diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index fc91e85f54ba..e2c970a9d61c 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -146,6 +146,7 @@ 
sense_reason_t target_cmd_size_check(struct se_cmd *cmd, unsigned int size); void target_qf_do_work(struct work_struct *work); bool target_check_wce(struct se_device *dev); bool target_check_fua(struct se_device *dev); +void __target_execute_cmd(struct se_cmd *, bool); /* target_core_stat.c */ void target_stat_setup_dev_default_groups(struct se_device *); diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c index a9057aa07176..04f616b3ba0a 100644 --- a/drivers/target/target_core_sbc.c +++ b/drivers/target/target_core_sbc.c @@ -602,7 +602,7 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool succes cmd->transport_state |= CMD_T_ACTIVE|CMD_T_BUSY|CMD_T_SENT; spin_unlock_irq(&cmd->t_state_lock); - __target_execute_cmd(cmd); + __target_execute_cmd(cmd, false); kfree(buf); return ret; diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 5ab3967dda43..6094a6beddde 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -754,7 +754,15 @@ EXPORT_SYMBOL(target_complete_cmd); void target_complete_cmd_with_length(struct se_cmd *cmd, u8 scsi_status, int length) { - if (scsi_status == SAM_STAT_GOOD && length < cmd->data_length) { + if (scsi_status != SAM_STAT_GOOD) { + return; + } + + /* + * Calculate new residual count based upon length of SCSI data + * transferred. 
+ */ + if (length < cmd->data_length) { if (cmd->se_cmd_flags & SCF_UNDERFLOW_BIT) { cmd->residual_count += cmd->data_length - length; } else { @@ -763,6 +771,12 @@ void target_complete_cmd_with_length(struct se_cmd *cmd, u8 scsi_status, int len } cmd->data_length = length; + } else if (length > cmd->data_length) { + cmd->se_cmd_flags |= SCF_OVERFLOW_BIT; + cmd->residual_count = length - cmd->data_length; + } else { + cmd->se_cmd_flags &= ~(SCF_OVERFLOW_BIT | SCF_UNDERFLOW_BIT); + cmd->residual_count = 0; } target_complete_cmd(cmd, scsi_status); @@ -1303,23 +1317,6 @@ target_setup_cmd_from_cdb(struct se_cmd *cmd, unsigned char *cdb) trace_target_sequencer_start(cmd); - /* - * Check for an existing UNIT ATTENTION condition - */ - ret = target_scsi3_ua_check(cmd); - if (ret) - return ret; - - ret = target_alua_state_check(cmd); - if (ret) - return ret; - - ret = target_check_reservation(cmd); - if (ret) { - cmd->scsi_status = SAM_STAT_RESERVATION_CONFLICT; - return ret; - } - ret = dev->transport->parse_cdb(cmd); if (ret == TCM_UNSUPPORTED_SCSI_OPCODE) pr_warn_ratelimited("%s/%s: Unsupported SCSI Opcode 0x%02x, sending CHECK_CONDITION.\n", @@ -1761,20 +1758,45 @@ queue_full: } EXPORT_SYMBOL(transport_generic_request_failure); -void __target_execute_cmd(struct se_cmd *cmd) +void __target_execute_cmd(struct se_cmd *cmd, bool do_checks) { sense_reason_t ret; - if (cmd->execute_cmd) { - ret = cmd->execute_cmd(cmd); - if (ret) { - spin_lock_irq(&cmd->t_state_lock); - cmd->transport_state &= ~(CMD_T_BUSY|CMD_T_SENT); - spin_unlock_irq(&cmd->t_state_lock); + if (!cmd->execute_cmd) { + ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + goto err; + } + if (do_checks) { + /* + * Check for an existing UNIT ATTENTION condition after + * target_handle_task_attr() has done SAM task attr + * checking, and possibly have already deferred execution + * out to target_restart_delayed_cmds() context.
+ */ + ret = target_scsi3_ua_check(cmd); + if (ret) + goto err; - transport_generic_request_failure(cmd, ret); + ret = target_alua_state_check(cmd); + if (ret) + goto err; + + ret = target_check_reservation(cmd); + if (ret) { + cmd->scsi_status = SAM_STAT_RESERVATION_CONFLICT; + goto err; } } + + ret = cmd->execute_cmd(cmd); + if (!ret) + return; +err: + spin_lock_irq(&cmd->t_state_lock); + cmd->transport_state &= ~(CMD_T_BUSY|CMD_T_SENT); + spin_unlock_irq(&cmd->t_state_lock); + + transport_generic_request_failure(cmd, ret); } static int target_write_prot_action(struct se_cmd *cmd) @@ -1819,6 +1841,8 @@ static bool target_handle_task_attr(struct se_cmd *cmd) if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH) return false; + cmd->se_cmd_flags |= SCF_TASK_ATTR_SET; + /* * Check for the existence of HEAD_OF_QUEUE, and if true return 1 * to allow the passed struct se_cmd list of tasks to the front of the list. @@ -1899,7 +1923,7 @@ void target_execute_cmd(struct se_cmd *cmd) return; } - __target_execute_cmd(cmd); + __target_execute_cmd(cmd, true); } EXPORT_SYMBOL(target_execute_cmd); @@ -1923,7 +1947,7 @@ static void target_restart_delayed_cmds(struct se_device *dev) list_del(&cmd->se_delayed_node); spin_unlock(&dev->delayed_cmd_lock); - __target_execute_cmd(cmd); + __target_execute_cmd(cmd, true); if (cmd->sam_task_attr == TCM_ORDERED_TAG) break; @@ -1941,6 +1965,9 @@ static void transport_complete_task_attr(struct se_cmd *cmd) if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH) return; + if (!(cmd->se_cmd_flags & SCF_TASK_ATTR_SET)) + goto restart; + if (cmd->sam_task_attr == TCM_SIMPLE_TAG) { atomic_dec_mb(&dev->simple_cmds); dev->dev_cur_ordered_id++; @@ -1957,7 +1984,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n", dev->dev_cur_ordered_id); } - +restart: target_restart_delayed_cmds(dev); } @@ -2557,15 +2584,10 @@ static void target_release_cmd_kref(struct 
kref *kref) bool fabric_stop; spin_lock_irqsave(&se_sess->sess_cmd_lock, flags); - if (list_empty(&se_cmd->se_cmd_list)) { - spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags); - target_free_cmd_mem(se_cmd); - se_cmd->se_tfo->release_cmd(se_cmd); - return; - } spin_lock(&se_cmd->t_state_lock); - fabric_stop = (se_cmd->transport_state & CMD_T_FABRIC_STOP); + fabric_stop = (se_cmd->transport_state & CMD_T_FABRIC_STOP) && + (se_cmd->transport_state & CMD_T_ABORTED); spin_unlock(&se_cmd->t_state_lock); if (se_cmd->cmd_wait_set || fabric_stop) { diff --git a/drivers/target/tcm_fc/tfc_sess.c b/drivers/target/tcm_fc/tfc_sess.c index f5186a744399..6ffbb603d912 100644 --- a/drivers/target/tcm_fc/tfc_sess.c +++ b/drivers/target/tcm_fc/tfc_sess.c @@ -91,6 +91,7 @@ static void ft_tport_delete(struct ft_tport *tport) ft_sess_delete_all(tport); lport = tport->lport; + lport->service_params &= ~FCP_SPPF_TARG_FCN; BUG_ON(tport != lport->prov[FC_TYPE_FCP]); RCU_INIT_POINTER(lport->prov[FC_TYPE_FCP], NULL); @@ -110,6 +111,7 @@ void ft_lport_add(struct fc_lport *lport, void *arg) { mutex_lock(&ft_lport_lock); ft_tport_get(lport); + lport->service_params |= FCP_SPPF_TARG_FCN; mutex_unlock(&ft_lport_lock); } diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 96a70789b4c2..4d6a5c672a3d 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -496,12 +496,10 @@ static int cp210x_write_reg_block(struct usb_serial_port *port, u8 req, void *dmabuf; int result; - dmabuf = kmalloc(bufsize, GFP_KERNEL); + dmabuf = kmemdup(buf, bufsize, GFP_KERNEL); if (!dmabuf) return -ENOMEM; - memcpy(dmabuf, buf, bufsize); - result = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), req, REQTYPE_HOST_TO_INTERFACE, 0, port_priv->bInterfaceNumber, dmabuf, bufsize, diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index ae8c0365abd6..944de657a07a 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ 
-350,6 +350,7 @@ void usb_serial_generic_read_bulk_callback(struct urb *urb) struct usb_serial_port *port = urb->context; unsigned char *data = urb->transfer_buffer; unsigned long flags; + int status = urb->status; int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { @@ -360,22 +361,22 @@ void usb_serial_generic_read_bulk_callback(struct urb *urb) dev_dbg(&port->dev, "%s - urb %d, len %d\n", __func__, i, urb->actual_length); - switch (urb->status) { + switch (status) { case 0: break; case -ENOENT: case -ECONNRESET: case -ESHUTDOWN: dev_dbg(&port->dev, "%s - urb stopped: %d\n", - __func__, urb->status); + __func__, status); return; case -EPIPE: dev_err(&port->dev, "%s - urb stopped: %d\n", - __func__, urb->status); + __func__, status); return; default: dev_dbg(&port->dev, "%s - nonzero urb status: %d\n", - __func__, urb->status); + __func__, status); goto resubmit; } @@ -399,6 +400,7 @@ void usb_serial_generic_write_bulk_callback(struct urb *urb) { unsigned long flags; struct usb_serial_port *port = urb->context; + int status = urb->status; int i; for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { @@ -410,22 +412,22 @@ void usb_serial_generic_write_bulk_callback(struct urb *urb) set_bit(i, &port->write_urbs_free); spin_unlock_irqrestore(&port->lock, flags); - switch (urb->status) { + switch (status) { case 0: break; case -ENOENT: case -ECONNRESET: case -ESHUTDOWN: dev_dbg(&port->dev, "%s - urb stopped: %d\n", - __func__, urb->status); + __func__, status); return; case -EPIPE: dev_err_console(port, "%s - urb stopped: %d\n", - __func__, urb->status); + __func__, status); return; default: dev_err_console(port, "%s - nonzero urb status: %d\n", - __func__, urb->status); + __func__, status); goto resubmit; } diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index d96d423d00e6..8e07536c233a 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -273,6 +273,7 @@ static void option_instat_callback(struct urb *urb); #define 
TELIT_PRODUCT_LE922_USBCFG5 0x1045 #define TELIT_PRODUCT_LE920 0x1200 #define TELIT_PRODUCT_LE910 0x1201 +#define TELIT_PRODUCT_LE910_USBCFG4 0x1206 /* ZTE PRODUCTS */ #define ZTE_VENDOR_ID 0x19d2 @@ -1198,6 +1199,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910), .driver_info = (kernel_ulong_t)&telit_le910_blacklist }, + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4), + .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg3 }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920), .driver_info = (kernel_ulong_t)&telit_le920_blacklist }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */ diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c index e7dbbef2af2a..07b4bf01061d 100644 --- a/drivers/usb/serial/ti_usb_3410_5052.c +++ b/drivers/usb/serial/ti_usb_3410_5052.c @@ -1,5 +1,4 @@ -/* vi: ts=8 sw=8 - * +/* * TI 3410/5052 USB Serial Driver * * Copyright (C) 2004 Texas Instruments @@ -35,9 +34,238 @@ #include <linux/usb.h> #include <linux/usb/serial.h> -#include "ti_usb_3410_5052.h" - -/* Defines */ +/* Configuration ids */ +#define TI_BOOT_CONFIG 1 +#define TI_ACTIVE_CONFIG 2 + +/* Vendor and product ids */ +#define TI_VENDOR_ID 0x0451 +#define IBM_VENDOR_ID 0x04b3 +#define TI_3410_PRODUCT_ID 0x3410 +#define IBM_4543_PRODUCT_ID 0x4543 +#define IBM_454B_PRODUCT_ID 0x454b +#define IBM_454C_PRODUCT_ID 0x454c +#define TI_3410_EZ430_ID 0xF430 /* TI ez430 development tool */ +#define TI_5052_BOOT_PRODUCT_ID 0x5052 /* no EEPROM, no firmware */ +#define TI_5152_BOOT_PRODUCT_ID 0x5152 /* no EEPROM, no firmware */ +#define TI_5052_EEPROM_PRODUCT_ID 0x505A /* EEPROM, no firmware */ +#define TI_5052_FIRMWARE_PRODUCT_ID 0x505F /* firmware is running */ +#define FRI2_PRODUCT_ID 0x5053 /* Fish River Island II */ + +/* Multi-Tech vendor and product 
ids */ +#define MTS_VENDOR_ID 0x06E0 +#define MTS_GSM_NO_FW_PRODUCT_ID 0xF108 +#define MTS_CDMA_NO_FW_PRODUCT_ID 0xF109 +#define MTS_CDMA_PRODUCT_ID 0xF110 +#define MTS_GSM_PRODUCT_ID 0xF111 +#define MTS_EDGE_PRODUCT_ID 0xF112 +#define MTS_MT9234MU_PRODUCT_ID 0xF114 +#define MTS_MT9234ZBA_PRODUCT_ID 0xF115 +#define MTS_MT9234ZBAOLD_PRODUCT_ID 0x0319 + +/* Abbott Diabetics vendor and product ids */ +#define ABBOTT_VENDOR_ID 0x1a61 +#define ABBOTT_STEREO_PLUG_ID 0x3410 +#define ABBOTT_PRODUCT_ID ABBOTT_STEREO_PLUG_ID +#define ABBOTT_STRIP_PORT_ID 0x3420 + +/* Honeywell vendor and product IDs */ +#define HONEYWELL_VENDOR_ID 0x10ac +#define HONEYWELL_HGI80_PRODUCT_ID 0x0102 /* Honeywell HGI80 */ + +/* Moxa UPORT 11x0 vendor and product IDs */ +#define MXU1_VENDOR_ID 0x110a +#define MXU1_1110_PRODUCT_ID 0x1110 +#define MXU1_1130_PRODUCT_ID 0x1130 +#define MXU1_1150_PRODUCT_ID 0x1150 +#define MXU1_1151_PRODUCT_ID 0x1151 +#define MXU1_1131_PRODUCT_ID 0x1131 + +/* Commands */ +#define TI_GET_VERSION 0x01 +#define TI_GET_PORT_STATUS 0x02 +#define TI_GET_PORT_DEV_INFO 0x03 +#define TI_GET_CONFIG 0x04 +#define TI_SET_CONFIG 0x05 +#define TI_OPEN_PORT 0x06 +#define TI_CLOSE_PORT 0x07 +#define TI_START_PORT 0x08 +#define TI_STOP_PORT 0x09 +#define TI_TEST_PORT 0x0A +#define TI_PURGE_PORT 0x0B +#define TI_RESET_EXT_DEVICE 0x0C +#define TI_WRITE_DATA 0x80 +#define TI_READ_DATA 0x81 +#define TI_REQ_TYPE_CLASS 0x82 + +/* Module identifiers */ +#define TI_I2C_PORT 0x01 +#define TI_IEEE1284_PORT 0x02 +#define TI_UART1_PORT 0x03 +#define TI_UART2_PORT 0x04 +#define TI_RAM_PORT 0x05 + +/* Modem status */ +#define TI_MSR_DELTA_CTS 0x01 +#define TI_MSR_DELTA_DSR 0x02 +#define TI_MSR_DELTA_RI 0x04 +#define TI_MSR_DELTA_CD 0x08 +#define TI_MSR_CTS 0x10 +#define TI_MSR_DSR 0x20 +#define TI_MSR_RI 0x40 +#define TI_MSR_CD 0x80 +#define TI_MSR_DELTA_MASK 0x0F +#define TI_MSR_MASK 0xF0 + +/* Line status */ +#define TI_LSR_OVERRUN_ERROR 0x01 +#define TI_LSR_PARITY_ERROR 0x02 +#define 
TI_LSR_FRAMING_ERROR 0x04 +#define TI_LSR_BREAK 0x08 +#define TI_LSR_ERROR 0x0F +#define TI_LSR_RX_FULL 0x10 +#define TI_LSR_TX_EMPTY 0x20 + +/* Line control */ +#define TI_LCR_BREAK 0x40 + +/* Modem control */ +#define TI_MCR_LOOP 0x04 +#define TI_MCR_DTR 0x10 +#define TI_MCR_RTS 0x20 + +/* Mask settings */ +#define TI_UART_ENABLE_RTS_IN 0x0001 +#define TI_UART_DISABLE_RTS 0x0002 +#define TI_UART_ENABLE_PARITY_CHECKING 0x0008 +#define TI_UART_ENABLE_DSR_OUT 0x0010 +#define TI_UART_ENABLE_CTS_OUT 0x0020 +#define TI_UART_ENABLE_X_OUT 0x0040 +#define TI_UART_ENABLE_XA_OUT 0x0080 +#define TI_UART_ENABLE_X_IN 0x0100 +#define TI_UART_ENABLE_DTR_IN 0x0800 +#define TI_UART_DISABLE_DTR 0x1000 +#define TI_UART_ENABLE_MS_INTS 0x2000 +#define TI_UART_ENABLE_AUTO_START_DMA 0x4000 + +/* Parity */ +#define TI_UART_NO_PARITY 0x00 +#define TI_UART_ODD_PARITY 0x01 +#define TI_UART_EVEN_PARITY 0x02 +#define TI_UART_MARK_PARITY 0x03 +#define TI_UART_SPACE_PARITY 0x04 + +/* Stop bits */ +#define TI_UART_1_STOP_BITS 0x00 +#define TI_UART_1_5_STOP_BITS 0x01 +#define TI_UART_2_STOP_BITS 0x02 + +/* Bits per character */ +#define TI_UART_5_DATA_BITS 0x00 +#define TI_UART_6_DATA_BITS 0x01 +#define TI_UART_7_DATA_BITS 0x02 +#define TI_UART_8_DATA_BITS 0x03 + +/* 232/485 modes */ +#define TI_UART_232 0x00 +#define TI_UART_485_RECEIVER_DISABLED 0x01 +#define TI_UART_485_RECEIVER_ENABLED 0x02 + +/* Pipe transfer mode and timeout */ +#define TI_PIPE_MODE_CONTINUOUS 0x01 +#define TI_PIPE_MODE_MASK 0x03 +#define TI_PIPE_TIMEOUT_MASK 0x7C +#define TI_PIPE_TIMEOUT_ENABLE 0x80 + +/* Config struct */ +struct ti_uart_config { + __u16 wBaudRate; + __u16 wFlags; + __u8 bDataBits; + __u8 bParity; + __u8 bStopBits; + char cXon; + char cXoff; + __u8 bUartMode; +} __packed; + +/* Get port status */ +struct ti_port_status { + __u8 bCmdCode; + __u8 bModuleId; + __u8 bErrorCode; + __u8 bMSR; + __u8 bLSR; +} __packed; + +/* Purge modes */ +#define TI_PURGE_OUTPUT 0x00 +#define TI_PURGE_INPUT 0x80 + +/* 
Read/Write data */ +#define TI_RW_DATA_ADDR_SFR 0x10 +#define TI_RW_DATA_ADDR_IDATA 0x20 +#define TI_RW_DATA_ADDR_XDATA 0x30 +#define TI_RW_DATA_ADDR_CODE 0x40 +#define TI_RW_DATA_ADDR_GPIO 0x50 +#define TI_RW_DATA_ADDR_I2C 0x60 +#define TI_RW_DATA_ADDR_FLASH 0x70 +#define TI_RW_DATA_ADDR_DSP 0x80 + +#define TI_RW_DATA_UNSPECIFIED 0x00 +#define TI_RW_DATA_BYTE 0x01 +#define TI_RW_DATA_WORD 0x02 +#define TI_RW_DATA_DOUBLE_WORD 0x04 + +struct ti_write_data_bytes { + __u8 bAddrType; + __u8 bDataType; + __u8 bDataCounter; + __be16 wBaseAddrHi; + __be16 wBaseAddrLo; + __u8 bData[0]; +} __packed; + +struct ti_read_data_request { + __u8 bAddrType; + __u8 bDataType; + __u8 bDataCounter; + __be16 wBaseAddrHi; + __be16 wBaseAddrLo; +} __packed; + +struct ti_read_data_bytes { + __u8 bCmdCode; + __u8 bModuleId; + __u8 bErrorCode; + __u8 bData[0]; +} __packed; + +/* Interrupt struct */ +struct ti_interrupt { + __u8 bICode; + __u8 bIInfo; +} __packed; + +/* Interrupt codes */ +#define TI_CODE_HARDWARE_ERROR 0xFF +#define TI_CODE_DATA_ERROR 0x03 +#define TI_CODE_MODEM_STATUS 0x04 + +/* Download firmware max packet size */ +#define TI_DOWNLOAD_MAX_PACKET_SIZE 64 + +/* Firmware image header */ +struct ti_firmware_header { + __le16 wLength; + __u8 bCheckSum; +} __packed; + +/* UART addresses */ +#define TI_UART1_BASE_ADDR 0xFFA0 /* UART 1 base address */ +#define TI_UART2_BASE_ADDR 0xFFB0 /* UART 2 base address */ +#define TI_UART_OFFSET_LCR 0x0002 /* UART MCR register offset */ +#define TI_UART_OFFSET_MCR 0x0004 /* UART MCR register offset */ #define TI_DRIVER_AUTHOR "Al Borchers <alborchers@steinerpoint.com>" #define TI_DRIVER_DESC "TI USB 3410/5052 Serial Driver" @@ -58,9 +286,6 @@ #define TI_EXTRA_VID_PID_COUNT 5 - -/* Structures */ - struct ti_port { int tp_is_open; __u8 tp_msr; @@ -84,9 +309,6 @@ struct ti_device { int td_urb_error; }; - -/* Function Declarations */ - static int ti_startup(struct usb_serial *serial); static void ti_release(struct usb_serial *serial); static 
int ti_port_probe(struct usb_serial_port *port); @@ -136,13 +358,8 @@ static int ti_write_byte(struct usb_serial_port *port, struct ti_device *tdev, static int ti_download_firmware(struct ti_device *tdev); - -/* Data */ - -/* module parameters */ static int closing_wait = TI_DEFAULT_CLOSING_WAIT; -/* supported devices */ static const struct usb_device_id ti_id_table_3410[] = { { USB_DEVICE(TI_VENDOR_ID, TI_3410_PRODUCT_ID) }, { USB_DEVICE(TI_VENDOR_ID, TI_3410_EZ430_ID) }, @@ -174,7 +391,7 @@ static const struct usb_device_id ti_id_table_5052[] = { { USB_DEVICE(TI_VENDOR_ID, TI_5152_BOOT_PRODUCT_ID) }, { USB_DEVICE(TI_VENDOR_ID, TI_5052_EEPROM_PRODUCT_ID) }, { USB_DEVICE(TI_VENDOR_ID, TI_5052_FIRMWARE_PRODUCT_ID) }, - { } /* terminator */ + { } }; static const struct usb_device_id ti_id_table_combined[] = { @@ -275,8 +492,6 @@ static struct usb_serial_driver * const serial_drivers[] = { &ti_1port_device, &ti_2port_device, NULL }; -/* Module */ - MODULE_AUTHOR(TI_DRIVER_AUTHOR); MODULE_DESCRIPTION(TI_DRIVER_DESC); MODULE_LICENSE("GPL"); @@ -302,8 +517,6 @@ MODULE_DEVICE_TABLE(usb, ti_id_table_combined); module_usb_serial_driver(serial_drivers, ti_id_table_combined); -/* Functions */ - static int ti_startup(struct usb_serial *serial) { struct ti_device *tdev; @@ -319,7 +532,6 @@ static int ti_startup(struct usb_serial *serial) dev->descriptor.bNumConfigurations, dev->actconfig->desc.bConfigurationValue); - /* create device structure */ tdev = kzalloc(sizeof(struct ti_device), GFP_KERNEL); if (!tdev) return -ENOMEM; @@ -435,7 +647,7 @@ static int ti_open(struct tty_struct *tty, struct usb_serial_port *port) struct urb *urb; int port_number; int status; - __u16 open_settings = (__u8)(TI_PIPE_MODE_CONTINOUS | + __u16 open_settings = (__u8)(TI_PIPE_MODE_CONTINUOUS | TI_PIPE_TIMEOUT_ENABLE | (TI_TRANSFER_TIMEOUT << 2)); @@ -954,6 +1166,15 @@ static void ti_break(struct tty_struct *tty, int break_state) dev_dbg(&port->dev, "%s - error setting break, %d\n", __func__, 
status); } +static int ti_get_port_from_code(unsigned char code) +{ + return (code >> 4) - 3; +} + +static int ti_get_func_from_code(unsigned char code) +{ + return code & 0x0f; +} static void ti_interrupt_callback(struct urb *urb) { @@ -995,8 +1216,8 @@ static void ti_interrupt_callback(struct urb *urb) goto exit; } - port_number = TI_GET_PORT_FROM_CODE(data[0]); - function = TI_GET_FUNC_FROM_CODE(data[0]); + port_number = ti_get_port_from_code(data[0]); + function = ti_get_func_from_code(data[0]); dev_dbg(dev, "%s - port_number %d, function %d, data 0x%02X\n", __func__, port_number, function, data[1]); diff --git a/drivers/usb/serial/ti_usb_3410_5052.h b/drivers/usb/serial/ti_usb_3410_5052.h deleted file mode 100644 index bbfd3a184600..000000000000 --- a/drivers/usb/serial/ti_usb_3410_5052.h +++ /dev/null @@ -1,259 +0,0 @@ -/* vi: ts=8 sw=8 - * - * TI 3410/5052 USB Serial Driver Header - * - * Copyright (C) 2004 Texas Instruments - * - * This driver is based on the Linux io_ti driver, which is - * Copyright (C) 2000-2002 Inside Out Networks - * Copyright (C) 2001-2002 Greg Kroah-Hartman - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * For questions or problems with this driver, contact Texas Instruments - * technical support, or Al Borchers <alborchers@steinerpoint.com>, or - * Peter Berger <pberger@brimson.com>. 
- */ - -#ifndef _TI_3410_5052_H_ -#define _TI_3410_5052_H_ - -/* Configuration ids */ -#define TI_BOOT_CONFIG 1 -#define TI_ACTIVE_CONFIG 2 - -/* Vendor and product ids */ -#define TI_VENDOR_ID 0x0451 -#define IBM_VENDOR_ID 0x04b3 -#define TI_3410_PRODUCT_ID 0x3410 -#define IBM_4543_PRODUCT_ID 0x4543 -#define IBM_454B_PRODUCT_ID 0x454b -#define IBM_454C_PRODUCT_ID 0x454c -#define TI_3410_EZ430_ID 0xF430 /* TI ez430 development tool */ -#define TI_5052_BOOT_PRODUCT_ID 0x5052 /* no EEPROM, no firmware */ -#define TI_5152_BOOT_PRODUCT_ID 0x5152 /* no EEPROM, no firmware */ -#define TI_5052_EEPROM_PRODUCT_ID 0x505A /* EEPROM, no firmware */ -#define TI_5052_FIRMWARE_PRODUCT_ID 0x505F /* firmware is running */ -#define FRI2_PRODUCT_ID 0x5053 /* Fish River Island II */ - -/* Multi-Tech vendor and product ids */ -#define MTS_VENDOR_ID 0x06E0 -#define MTS_GSM_NO_FW_PRODUCT_ID 0xF108 -#define MTS_CDMA_NO_FW_PRODUCT_ID 0xF109 -#define MTS_CDMA_PRODUCT_ID 0xF110 -#define MTS_GSM_PRODUCT_ID 0xF111 -#define MTS_EDGE_PRODUCT_ID 0xF112 -#define MTS_MT9234MU_PRODUCT_ID 0xF114 -#define MTS_MT9234ZBA_PRODUCT_ID 0xF115 -#define MTS_MT9234ZBAOLD_PRODUCT_ID 0x0319 - -/* Abbott Diabetics vendor and product ids */ -#define ABBOTT_VENDOR_ID 0x1a61 -#define ABBOTT_STEREO_PLUG_ID 0x3410 -#define ABBOTT_PRODUCT_ID ABBOTT_STEREO_PLUG_ID -#define ABBOTT_STRIP_PORT_ID 0x3420 - -/* Honeywell vendor and product IDs */ -#define HONEYWELL_VENDOR_ID 0x10ac -#define HONEYWELL_HGI80_PRODUCT_ID 0x0102 /* Honeywell HGI80 */ - -/* Moxa UPORT 11x0 vendor and product IDs */ -#define MXU1_VENDOR_ID 0x110a -#define MXU1_1110_PRODUCT_ID 0x1110 -#define MXU1_1130_PRODUCT_ID 0x1130 -#define MXU1_1131_PRODUCT_ID 0x1131 -#define MXU1_1150_PRODUCT_ID 0x1150 -#define MXU1_1151_PRODUCT_ID 0x1151 - -/* Commands */ -#define TI_GET_VERSION 0x01 -#define TI_GET_PORT_STATUS 0x02 -#define TI_GET_PORT_DEV_INFO 0x03 -#define TI_GET_CONFIG 0x04 -#define TI_SET_CONFIG 0x05 -#define TI_OPEN_PORT 0x06 -#define TI_CLOSE_PORT 
0x07 -#define TI_START_PORT 0x08 -#define TI_STOP_PORT 0x09 -#define TI_TEST_PORT 0x0A -#define TI_PURGE_PORT 0x0B -#define TI_RESET_EXT_DEVICE 0x0C -#define TI_WRITE_DATA 0x80 -#define TI_READ_DATA 0x81 -#define TI_REQ_TYPE_CLASS 0x82 - -/* Module identifiers */ -#define TI_I2C_PORT 0x01 -#define TI_IEEE1284_PORT 0x02 -#define TI_UART1_PORT 0x03 -#define TI_UART2_PORT 0x04 -#define TI_RAM_PORT 0x05 - -/* Modem status */ -#define TI_MSR_DELTA_CTS 0x01 -#define TI_MSR_DELTA_DSR 0x02 -#define TI_MSR_DELTA_RI 0x04 -#define TI_MSR_DELTA_CD 0x08 -#define TI_MSR_CTS 0x10 -#define TI_MSR_DSR 0x20 -#define TI_MSR_RI 0x40 -#define TI_MSR_CD 0x80 -#define TI_MSR_DELTA_MASK 0x0F -#define TI_MSR_MASK 0xF0 - -/* Line status */ -#define TI_LSR_OVERRUN_ERROR 0x01 -#define TI_LSR_PARITY_ERROR 0x02 -#define TI_LSR_FRAMING_ERROR 0x04 -#define TI_LSR_BREAK 0x08 -#define TI_LSR_ERROR 0x0F -#define TI_LSR_RX_FULL 0x10 -#define TI_LSR_TX_EMPTY 0x20 - -/* Line control */ -#define TI_LCR_BREAK 0x40 - -/* Modem control */ -#define TI_MCR_LOOP 0x04 -#define TI_MCR_DTR 0x10 -#define TI_MCR_RTS 0x20 - -/* Mask settings */ -#define TI_UART_ENABLE_RTS_IN 0x0001 -#define TI_UART_DISABLE_RTS 0x0002 -#define TI_UART_ENABLE_PARITY_CHECKING 0x0008 -#define TI_UART_ENABLE_DSR_OUT 0x0010 -#define TI_UART_ENABLE_CTS_OUT 0x0020 -#define TI_UART_ENABLE_X_OUT 0x0040 -#define TI_UART_ENABLE_XA_OUT 0x0080 -#define TI_UART_ENABLE_X_IN 0x0100 -#define TI_UART_ENABLE_DTR_IN 0x0800 -#define TI_UART_DISABLE_DTR 0x1000 -#define TI_UART_ENABLE_MS_INTS 0x2000 -#define TI_UART_ENABLE_AUTO_START_DMA 0x4000 - -/* Parity */ -#define TI_UART_NO_PARITY 0x00 -#define TI_UART_ODD_PARITY 0x01 -#define TI_UART_EVEN_PARITY 0x02 -#define TI_UART_MARK_PARITY 0x03 -#define TI_UART_SPACE_PARITY 0x04 - -/* Stop bits */ -#define TI_UART_1_STOP_BITS 0x00 -#define TI_UART_1_5_STOP_BITS 0x01 -#define TI_UART_2_STOP_BITS 0x02 - -/* Bits per character */ -#define TI_UART_5_DATA_BITS 0x00 -#define TI_UART_6_DATA_BITS 0x01 -#define 
TI_UART_7_DATA_BITS 0x02 -#define TI_UART_8_DATA_BITS 0x03 - -/* 232/485 modes */ -#define TI_UART_232 0x00 -#define TI_UART_485_RECEIVER_DISABLED 0x01 -#define TI_UART_485_RECEIVER_ENABLED 0x02 - -/* Pipe transfer mode and timeout */ -#define TI_PIPE_MODE_CONTINOUS 0x01 -#define TI_PIPE_MODE_MASK 0x03 -#define TI_PIPE_TIMEOUT_MASK 0x7C -#define TI_PIPE_TIMEOUT_ENABLE 0x80 - -/* Config struct */ -struct ti_uart_config { - __u16 wBaudRate; - __u16 wFlags; - __u8 bDataBits; - __u8 bParity; - __u8 bStopBits; - char cXon; - char cXoff; - __u8 bUartMode; -} __attribute__((packed)); - -/* Get port status */ -struct ti_port_status { - __u8 bCmdCode; - __u8 bModuleId; - __u8 bErrorCode; - __u8 bMSR; - __u8 bLSR; -} __attribute__((packed)); - -/* Purge modes */ -#define TI_PURGE_OUTPUT 0x00 -#define TI_PURGE_INPUT 0x80 - -/* Read/Write data */ -#define TI_RW_DATA_ADDR_SFR 0x10 -#define TI_RW_DATA_ADDR_IDATA 0x20 -#define TI_RW_DATA_ADDR_XDATA 0x30 -#define TI_RW_DATA_ADDR_CODE 0x40 -#define TI_RW_DATA_ADDR_GPIO 0x50 -#define TI_RW_DATA_ADDR_I2C 0x60 -#define TI_RW_DATA_ADDR_FLASH 0x70 -#define TI_RW_DATA_ADDR_DSP 0x80 - -#define TI_RW_DATA_UNSPECIFIED 0x00 -#define TI_RW_DATA_BYTE 0x01 -#define TI_RW_DATA_WORD 0x02 -#define TI_RW_DATA_DOUBLE_WORD 0x04 - -struct ti_write_data_bytes { - __u8 bAddrType; - __u8 bDataType; - __u8 bDataCounter; - __be16 wBaseAddrHi; - __be16 wBaseAddrLo; - __u8 bData[0]; -} __attribute__((packed)); - -struct ti_read_data_request { - __u8 bAddrType; - __u8 bDataType; - __u8 bDataCounter; - __be16 wBaseAddrHi; - __be16 wBaseAddrLo; -} __attribute__((packed)); - -struct ti_read_data_bytes { - __u8 bCmdCode; - __u8 bModuleId; - __u8 bErrorCode; - __u8 bData[0]; -} __attribute__((packed)); - -/* Interrupt struct */ -struct ti_interrupt { - __u8 bICode; - __u8 bIInfo; -} __attribute__((packed)); - -/* Interrupt codes */ -#define TI_GET_PORT_FROM_CODE(c) (((c) >> 4) - 3) -#define TI_GET_FUNC_FROM_CODE(c) ((c) & 0x0f) -#define TI_CODE_HARDWARE_ERROR 0xFF 
-#define TI_CODE_DATA_ERROR 0x03 -#define TI_CODE_MODEM_STATUS 0x04 - -/* Download firmware max packet size */ -#define TI_DOWNLOAD_MAX_PACKET_SIZE 64 - -/* Firmware image header */ -struct ti_firmware_header { - __le16 wLength; - __u8 bCheckSum; -} __attribute__((packed)); - -/* UART addresses */ -#define TI_UART1_BASE_ADDR 0xFFA0 /* UART 1 base address */ -#define TI_UART2_BASE_ADDR 0xFFB0 /* UART 2 base address */ -#define TI_UART_OFFSET_LCR 0x0002 /* UART MCR register offset */ -#define TI_UART_OFFSET_MCR 0x0004 /* UART MCR register offset */ - -#endif /* _TI_3410_5052_H_ */ diff --git a/fs/Kconfig b/fs/Kconfig index 4524916fa200..2bc7ad775842 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -70,6 +70,12 @@ config FS_POSIX_ACL config EXPORTFS tristate +config EXPORTFS_BLOCK_OPS + bool "Enable filesystem export operations for block IO" + help + This option enables the export operations for a filesystem to support + external block IO. + config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT default y diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 72c03354c14b..c7efddf6e038 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -89,7 +89,8 @@ config BINFMT_SCRIPT config BINFMT_FLAT bool "Kernel support for flat binaries" - depends on !MMU && (!FRV || BROKEN) + depends on !MMU || M68K + depends on !FRV || BROKEN help Support uClinux FLAT format binaries. 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 203589311bf8..464a972e88c1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -67,8 +67,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *, struct elf_fdpic_params *); #ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *, - unsigned long *); static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, struct file *, struct mm_struct *); @@ -515,8 +513,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, sp = mm->start_stack; /* stack the program arguments and environment */ - if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0) + if (transfer_args_to_stack(bprm, &sp) < 0) return -EFAULT; + sp &= ~15; #endif /* @@ -711,39 +710,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, /*****************************************************************************/ /* - * transfer the program arguments and environment from the holding pages onto - * the stack - */ -#ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, - unsigned long *_sp) -{ - unsigned long index, stop, sp; - char *src; - int ret = 0; - - stop = bprm->p >> PAGE_SHIFT; - sp = *_sp; - - for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { - src = kmap(bprm->page[index]); - sp -= PAGE_SIZE; - if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0) - ret = -EFAULT; - kunmap(bprm->page[index]); - if (ret < 0) - goto out; - } - - *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; - -out: - return ret; -} -#endif - -/*****************************************************************************/ -/* * load the appropriate binary image (executable or interpreter) into memory * - we assume no MMU is available * - if no other PIC bits are set in params->hdr->e_flags diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index caf9e39bb82b..9b2917a30294 100644 --- a/fs/binfmt_flat.c +++ 
b/fs/binfmt_flat.c @@ -15,7 +15,8 @@ * JAN/99 -- coded full program relocation (gerg@snapgear.com) */ -#include <linux/export.h> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> @@ -25,8 +26,6 @@ #include <linux/string.h> #include <linux/fs.h> #include <linux/file.h> -#include <linux/stat.h> -#include <linux/fcntl.h> #include <linux/ptrace.h> #include <linux/user.h> #include <linux/slab.h> @@ -34,26 +33,16 @@ #include <linux/personality.h> #include <linux/init.h> #include <linux/flat.h> -#include <linux/syscalls.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> #include <asm/byteorder.h> -#include <asm/uaccess.h> #include <asm/unaligned.h> #include <asm/cacheflush.h> #include <asm/page.h> /****************************************************************************/ -#if 0 -#define DEBUG 1 -#endif - -#ifdef DEBUG -#define DBG_FLT(a...) printk(a) -#else -#define DBG_FLT(a...) -#endif - /* * User data (data section and bss) needs to be aligned. * We pick 0x20 here because it is the max value elf2flt has always @@ -80,7 +69,7 @@ struct lib_info { unsigned long text_len; /* Length of text segment */ unsigned long entry; /* Start address for this module */ unsigned long build_date; /* When this one was compiled */ - short loaded; /* Has this library been loaded? */ + bool loaded; /* Has this library been loaded? 
*/ } lib_list[MAX_SHARED_LIBS]; }; @@ -106,59 +95,67 @@ static struct linux_binfmt flat_format = { static int flat_core_dump(struct coredump_params *cprm) { - printk("Process %s:%d received signr %d and should have core dumped\n", - current->comm, current->pid, (int) cprm->siginfo->si_signo); - return(1); + pr_warn("Process %s:%d received signr %d and should have core dumped\n", + current->comm, current->pid, cprm->siginfo->si_signo); + return 1; } /****************************************************************************/ /* * create_flat_tables() parses the env- and arg-strings in new user * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. + * addresses on the "stack", recording the new stack pointer value. */ -static unsigned long create_flat_tables( - unsigned long pp, - struct linux_binprm * bprm) +static int create_flat_tables(struct linux_binprm *bprm, unsigned long arg_start) { - unsigned long *argv,*envp; - unsigned long * sp; - char * p = (char*)pp; - int argc = bprm->argc; - int envc = bprm->envc; - char uninitialized_var(dummy); - - sp = (unsigned long *)p; - sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); - sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN); - argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); - envp = argv + (argc + 1); + char __user *p; + unsigned long __user *sp; + long i, len; + + p = (char __user *)arg_start; + sp = (unsigned long __user *)current->mm->start_stack; + + sp -= bprm->envc + 1; + sp -= bprm->argc + 1; + sp -= flat_argvp_envp_on_stack() ? 
2 : 0; + sp -= 1; /* &argc */ + current->mm->start_stack = (unsigned long)sp & -FLAT_STACK_ALIGN; + sp = (unsigned long __user *)current->mm->start_stack; + + __put_user(bprm->argc, sp++); if (flat_argvp_envp_on_stack()) { - put_user((unsigned long) envp, sp + 2); - put_user((unsigned long) argv, sp + 1); - } - - put_user(argc, sp); - current->mm->arg_start = (unsigned long) p; - while (argc-->0) { - put_user((unsigned long) p, argv++); - do { - get_user(dummy, p); p++; - } while (dummy); - } - put_user((unsigned long) NULL, argv); - current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-->0) { - put_user((unsigned long)p, envp); envp++; - do { - get_user(dummy, p); p++; - } while (dummy); - } - put_user((unsigned long) NULL, envp); - current->mm->env_end = (unsigned long) p; - return (unsigned long)sp; + unsigned long argv, envp; + argv = (unsigned long)(sp + 2); + envp = (unsigned long)(sp + 2 + bprm->argc + 1); + __put_user(argv, sp++); + __put_user(envp, sp++); + } + + current->mm->arg_start = (unsigned long)p; + for (i = bprm->argc; i > 0; i--) { + __put_user((unsigned long)p, sp++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(0, sp++); + current->mm->arg_end = (unsigned long)p; + + current->mm->env_start = (unsigned long) p; + for (i = bprm->envc; i > 0; i--) { + __put_user((unsigned long)p, sp++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(0, sp++); + current->mm->env_end = (unsigned long)p; + + return 0; } /****************************************************************************/ @@ -190,17 +187,17 @@ static int decompress_exec( loff_t fpos; int ret, retval; - DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len); + pr_debug("decompress_exec(offset=%lx,buf=%p,len=%lx)\n", offset, dst, len); memset(&strm, 0, sizeof(strm)); strm.workspace = 
kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); if (strm.workspace == NULL) { - DBG_FLT("binfmt_flat: no memory for decompress workspace\n"); + pr_debug("no memory for decompress workspace\n"); return -ENOMEM; } buf = kmalloc(LBUFSIZE, GFP_KERNEL); if (buf == NULL) { - DBG_FLT("binfmt_flat: no memory for read buffer\n"); + pr_debug("no memory for read buffer\n"); retval = -ENOMEM; goto out_free; } @@ -218,49 +215,49 @@ static int decompress_exec( /* Check minimum size -- gzip header */ if (ret < 10) { - DBG_FLT("binfmt_flat: file too small?\n"); + pr_debug("file too small?\n"); goto out_free_buf; } /* Check gzip magic number */ if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) { - DBG_FLT("binfmt_flat: unknown compression magic?\n"); + pr_debug("unknown compression magic?\n"); goto out_free_buf; } /* Check gzip method */ if (buf[2] != 8) { - DBG_FLT("binfmt_flat: unknown compression method?\n"); + pr_debug("unknown compression method?\n"); goto out_free_buf; } /* Check gzip flags */ if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) || (buf[3] & RESERVED)) { - DBG_FLT("binfmt_flat: unknown flags?\n"); + pr_debug("unknown flags?\n"); goto out_free_buf; } ret = 10; if (buf[3] & EXTRA_FIELD) { ret += 2 + buf[10] + (buf[11] << 8); - if (unlikely(LBUFSIZE <= ret)) { - DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); + if (unlikely(ret >= LBUFSIZE)) { + pr_debug("buffer overflow (EXTRA)?\n"); goto out_free_buf; } } if (buf[3] & ORIG_NAME) { while (ret < LBUFSIZE && buf[ret++] != 0) ; - if (unlikely(LBUFSIZE == ret)) { - DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); + if (unlikely(ret == LBUFSIZE)) { + pr_debug("buffer overflow (ORIG_NAME)?\n"); goto out_free_buf; } } if (buf[3] & COMMENT) { while (ret < LBUFSIZE && buf[ret++] != 0) ; - if (unlikely(LBUFSIZE == ret)) { - DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); + if (unlikely(ret == LBUFSIZE)) { + pr_debug("buffer overflow (COMMENT)?\n"); goto out_free_buf; } } @@ -273,7 +270,7 
@@ static int decompress_exec( strm.total_out = 0; if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) { - DBG_FLT("binfmt_flat: zlib init failed?\n"); + pr_debug("zlib init failed?\n"); goto out_free_buf; } @@ -290,7 +287,7 @@ static int decompress_exec( } if (ret < 0) { - DBG_FLT("binfmt_flat: decompression failed (%d), %s\n", + pr_debug("decompression failed (%d), %s\n", ret, strm.msg); goto out_zlib; } @@ -327,24 +324,23 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) r &= 0x00ffffff; /* Trim ID off here */ } if (id >= MAX_SHARED_LIBS) { - printk("BINFMT_FLAT: reference 0x%x to shared library %d", - (unsigned) r, id); + pr_err("reference 0x%lx to shared library %d", r, id); goto failed; } if (curid != id) { if (internalp) { - printk("BINFMT_FLAT: reloc address 0x%x not in same module " - "(%d != %d)", (unsigned) r, curid, id); + pr_err("reloc address 0x%lx not in same module " + "(%d != %d)", r, curid, id); goto failed; - } else if ( ! p->lib_list[id].loaded && - load_flat_shared_library(id, p) < 0) { - printk("BINFMT_FLAT: failed to load library %d", id); + } else if (!p->lib_list[id].loaded && + load_flat_shared_library(id, p) < 0) { + pr_err("failed to load library %d", id); goto failed; } /* Check versioning information (i.e. 
time stamps) */ if (p->lib_list[id].build_date && p->lib_list[curid].build_date && p->lib_list[curid].build_date < p->lib_list[id].build_date) { - printk("BINFMT_FLAT: library %d is younger than %d", id, curid); + pr_err("library %d is younger than %d", id, curid); goto failed; } } @@ -358,8 +354,8 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) text_len = p->lib_list[id].text_len; if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { - printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", - (int) r,(int)(start_brk-start_data+text_len),(int)text_len); + pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)", + r, start_brk-start_data+text_len, text_len); goto failed; } @@ -369,10 +365,10 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) addr = r - text_len + start_data; /* Range checked already above so doing the range tests is redundant...*/ - return(addr); + return addr; failed: - printk(", killing %s!\n", current->comm); + pr_cont(", killing %s!\n", current->comm); send_sig(SIGSEGV, current, 0); return RELOC_FAILED; @@ -382,62 +378,57 @@ failed: static void old_reloc(unsigned long rl) { -#ifdef DEBUG - char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; -#endif + static const char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; flat_v2_reloc_t r; - unsigned long *ptr; - + unsigned long __user *ptr; + unsigned long val; + r.value = rl; #if defined(CONFIG_COLDFIRE) - ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset); + ptr = (unsigned long __user *)(current->mm->start_code + r.reloc.offset); #else - ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset); + ptr = (unsigned long __user *)(current->mm->start_data + r.reloc.offset); #endif + get_user(val, ptr); + + pr_debug("Relocation of variable at DATASEG+%x " + "(address %p, currently %lx) into segment %s\n", + r.reloc.offset, ptr, val, segment[r.reloc.type]); -#ifdef DEBUG - printk("Relocation of 
variable at DATASEG+%x " - "(address %p, currently %x) into segment %s\n", - r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]); -#endif - switch (r.reloc.type) { case OLD_FLAT_RELOC_TYPE_TEXT: - *ptr += current->mm->start_code; + val += current->mm->start_code; break; case OLD_FLAT_RELOC_TYPE_DATA: - *ptr += current->mm->start_data; + val += current->mm->start_data; break; case OLD_FLAT_RELOC_TYPE_BSS: - *ptr += current->mm->end_data; + val += current->mm->end_data; break; default: - printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type); + pr_err("Unknown relocation type=%x\n", r.reloc.type); break; } + put_user(val, ptr); -#ifdef DEBUG - printk("Relocation became %x\n", (int)*ptr); -#endif -} + pr_debug("Relocation became %lx\n", val); +} /****************************************************************************/ -static int load_flat_file(struct linux_binprm * bprm, +static int load_flat_file(struct linux_binprm *bprm, struct lib_info *libinfo, int id, unsigned long *extra_stack) { - struct flat_hdr * hdr; - unsigned long textpos = 0, datapos = 0, result; - unsigned long realdatastart = 0; - unsigned long text_len, data_len, bss_len, stack_len, flags; - unsigned long full_data; - unsigned long len, memp = 0; - unsigned long memp_size, extra, rlim; - unsigned long *reloc = 0, *rp; + struct flat_hdr *hdr; + unsigned long textpos, datapos, realdatastart; + unsigned long text_len, data_len, bss_len, stack_len, full_data, flags; + unsigned long len, memp, memp_size, extra, rlim; + unsigned long __user *reloc, *rp; struct inode *inode; - int i, rev, relocs = 0; + int i, rev, relocs; loff_t fpos; unsigned long start_code, end_code; + ssize_t result; int ret; hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ @@ -469,20 +460,30 @@ static int load_flat_file(struct linux_binprm * bprm, } if (flags & FLAT_FLAG_KTRACE) - printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename); + pr_info("Loading file: %s\n", bprm->filename); if (rev != 
FLAT_VERSION && rev != OLD_FLAT_VERSION) { - printk("BINFMT_FLAT: bad flat file version 0x%x (supported " - "0x%lx and 0x%lx)\n", - rev, FLAT_VERSION, OLD_FLAT_VERSION); + pr_err("bad flat file version 0x%x (supported 0x%lx and 0x%lx)\n", + rev, FLAT_VERSION, OLD_FLAT_VERSION); ret = -ENOEXEC; goto err; } - + /* Don't allow old format executables to use shared libraries */ if (rev == OLD_FLAT_VERSION && id != 0) { - printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n", - (int) FLAT_VERSION); + pr_err("shared libraries are not available before rev 0x%lx\n", + FLAT_VERSION); + ret = -ENOEXEC; + goto err; + } + + /* + * Make sure the header params are sane. + * 28 bits (256 MB) is way more than reasonable in this case. + * If some top bits are set we have probable binary corruption. + */ + if ((text_len | data_len | bss_len | stack_len | full_data) >> 28) { + pr_err("bad header\n"); ret = -ENOEXEC; goto err; } @@ -496,7 +497,7 @@ static int load_flat_file(struct linux_binprm * bprm, #ifndef CONFIG_BINFMT_ZFLAT if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) { - printk("Support for ZFLAT executables is not enabled.\n"); + pr_err("Support for ZFLAT executables is not enabled.\n"); ret = -ENOEXEC; goto err; } @@ -517,11 +518,9 @@ static int load_flat_file(struct linux_binprm * bprm, /* Flush all traces of the currently running executable */ if (id == 0) { - result = flush_old_exec(bprm); - if (result) { - ret = result; + ret = flush_old_exec(bprm); + if (ret) goto err; - } /* OK, This is the point of no return */ set_personality(PER_LINUX_32BIT); @@ -539,48 +538,48 @@ static int load_flat_file(struct linux_binprm * bprm, * case, and then the fully copied to RAM case which lumps * it all together. 
*/ - if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) { + if (!IS_ENABLED(CONFIG_MMU) && !(flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) { /* * this should give us a ROM ptr, but if it doesn't we don't * really care */ - DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); + pr_debug("ROM mapping of file (we hope)\n"); textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_EXECUTABLE, 0); if (!textpos || IS_ERR_VALUE(textpos)) { - if (!textpos) - textpos = (unsigned long) -ENOMEM; - printk("Unable to mmap process text, errno %d\n", (int)-textpos); ret = textpos; + if (!textpos) + ret = -ENOMEM; + pr_err("Unable to mmap process text, errno %d\n", ret); goto err; } len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - realdatastart = vm_mmap(0, 0, len, + realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) { + ret = realdatastart; if (!realdatastart) - realdatastart = (unsigned long) -ENOMEM; - printk("Unable to allocate RAM for process data, errno %d\n", - (int)-realdatastart); + ret = -ENOMEM; + pr_err("Unable to allocate RAM for process data, " + "errno %d\n", ret); vm_munmap(textpos, text_len); - ret = realdatastart; goto err; } datapos = ALIGN(realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long), FLAT_DATA_ALIGN); - DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n", - (int)(data_len + bss_len + stack_len), (int)datapos); + pr_debug("Allocated data+bss+stack (%ld bytes): %lx\n", + data_len + bss_len + stack_len, datapos); fpos = ntohl(hdr->data_start); #ifdef CONFIG_BINFMT_ZFLAT if (flags & FLAT_FLAG_GZDATA) { - result = decompress_exec(bprm, fpos, (char *) datapos, + result = decompress_exec(bprm, fpos, (char *)datapos, full_data, 0); } else #endif @@ -589,29 +588,30 @@ static int load_flat_file(struct linux_binprm * bprm, full_data); } if (IS_ERR_VALUE(result)) { - printk("Unable to read 
data+bss, errno %d\n", (int)-result); + ret = result; + pr_err("Unable to read data+bss, errno %d\n", ret); vm_munmap(textpos, text_len); vm_munmap(realdatastart, len); - ret = result; goto err; } - reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); + reloc = (unsigned long __user *) + (datapos + (ntohl(hdr->reloc_start) - text_len)); memp = realdatastart; memp_size = len; } else { len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - textpos = vm_mmap(0, 0, len, + textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); if (!textpos || IS_ERR_VALUE(textpos)) { - if (!textpos) - textpos = (unsigned long) -ENOMEM; - printk("Unable to allocate RAM for process text/data, errno %d\n", - (int)-textpos); ret = textpos; + if (!textpos) + ret = -ENOMEM; + pr_err("Unable to allocate RAM for process text/data, " + "errno %d\n", ret); goto err; } @@ -620,7 +620,7 @@ static int load_flat_file(struct linux_binprm * bprm, MAX_SHARED_LIBS * sizeof(unsigned long), FLAT_DATA_ALIGN); - reloc = (unsigned long *) + reloc = (unsigned long __user *) (datapos + (ntohl(hdr->reloc_start) - text_len)); memp = textpos; memp_size = len; @@ -629,21 +629,59 @@ static int load_flat_file(struct linux_binprm * bprm, * load it all in and treat it like a RAM load from now on */ if (flags & FLAT_FLAG_GZIP) { - result = decompress_exec(bprm, sizeof (struct flat_hdr), - (((char *) textpos) + sizeof (struct flat_hdr)), +#ifndef CONFIG_MMU + result = decompress_exec(bprm, sizeof(struct flat_hdr), + (((char *)textpos) + sizeof(struct flat_hdr)), (text_len + full_data - - sizeof (struct flat_hdr)), + - sizeof(struct flat_hdr)), 0); memmove((void *) datapos, (void *) realdatastart, full_data); +#else + /* + * This is used on MMU systems mainly for testing. + * Let's use a kernel buffer to simplify things. 
+ */ + long unz_text_len = text_len - sizeof(struct flat_hdr); + long unz_len = unz_text_len + full_data; + char *unz_data = vmalloc(unz_len); + if (!unz_data) { + result = -ENOMEM; + } else { + result = decompress_exec(bprm, sizeof(struct flat_hdr), + unz_data, unz_len, 0); + if (result == 0 && + (copy_to_user((void __user *)textpos + sizeof(struct flat_hdr), + unz_data, unz_text_len) || + copy_to_user((void __user *)datapos, + unz_data + unz_text_len, full_data))) + result = -EFAULT; + vfree(unz_data); + } +#endif } else if (flags & FLAT_FLAG_GZDATA) { result = read_code(bprm->file, textpos, 0, text_len); - if (!IS_ERR_VALUE(result)) + if (!IS_ERR_VALUE(result)) { +#ifndef CONFIG_MMU result = decompress_exec(bprm, text_len, (char *) datapos, full_data, 0); - } - else +#else + char *unz_data = vmalloc(full_data); + if (!unz_data) { + result = -ENOMEM; + } else { + result = decompress_exec(bprm, text_len, + unz_data, full_data, 0); + if (result == 0 && + copy_to_user((void __user *)datapos, + unz_data, full_data)) + result = -EFAULT; + vfree(unz_data); + } #endif + } + } else +#endif /* CONFIG_BINFMT_ZFLAT */ { result = read_code(bprm->file, textpos, 0, text_len); if (!IS_ERR_VALUE(result)) @@ -652,21 +690,19 @@ static int load_flat_file(struct linux_binprm * bprm, full_data); } if (IS_ERR_VALUE(result)) { - printk("Unable to read code+data+bss, errno %d\n",(int)-result); + ret = result; + pr_err("Unable to read code+data+bss, errno %d\n", ret); vm_munmap(textpos, text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long)); - ret = result; goto err; } } - if (flags & FLAT_FLAG_KTRACE) - printk("Mapping is %x, Entry point is %x, data_start is %x\n", - (int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); + start_code = textpos + sizeof(struct flat_hdr); + end_code = textpos + text_len; + text_len -= sizeof(struct flat_hdr); /* the real code len */ /* The main program needs a little extra setup in the task structure */ - start_code = textpos 
+ sizeof (struct flat_hdr); - end_code = textpos + text_len; if (id == 0) { current->mm->start_code = start_code; current->mm->end_code = end_code; @@ -681,19 +717,19 @@ static int load_flat_file(struct linux_binprm * bprm, */ current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; +#ifndef CONFIG_MMU current->mm->context.end_brk = memp + memp_size - stack_len; +#endif } - if (flags & FLAT_FLAG_KTRACE) - printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n", + if (flags & FLAT_FLAG_KTRACE) { + pr_info("Mapping is %lx, Entry point is %x, data_start is %x\n", + textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); + pr_info("%s %s: TEXT=%lx-%lx DATA=%lx-%lx BSS=%lx-%lx\n", id ? "Lib" : "Load", bprm->filename, - (int) start_code, (int) end_code, - (int) datapos, - (int) (datapos + data_len), - (int) (datapos + data_len), - (int) (((datapos + data_len + bss_len) + 3) & ~3)); - - text_len -= sizeof(struct flat_hdr); /* the real code len */ + start_code, end_code, datapos, datapos + data_len, + datapos + data_len, (datapos + data_len + bss_len + 3) & ~3); + } /* Store the current module values into the global library structure */ libinfo->lib_list[id].start_code = start_code; @@ -703,7 +739,7 @@ static int load_flat_file(struct linux_binprm * bprm, libinfo->lib_list[id].loaded = 1; libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; libinfo->lib_list[id].build_date = ntohl(hdr->build_date); - + /* * We just load the allocations into some temporary memory to * help simplify all this mumbo jumbo @@ -717,15 +753,20 @@ static int load_flat_file(struct linux_binprm * bprm, * image. 
*/ if (flags & FLAT_FLAG_GOTPIC) { - for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) { - unsigned long addr; - if (*rp) { - addr = calc_reloc(*rp, libinfo, id, 0); + for (rp = (unsigned long __user *)datapos; ; rp++) { + unsigned long addr, rp_val; + if (get_user(rp_val, rp)) + return -EFAULT; + if (rp_val == 0xffffffff) + break; + if (rp_val) { + addr = calc_reloc(rp_val, libinfo, id, 0); if (addr == RELOC_FAILED) { ret = -ENOEXEC; goto err; } - *rp = addr; + if (put_user(addr, rp)) + return -EFAULT; } } } @@ -742,19 +783,23 @@ static int load_flat_file(struct linux_binprm * bprm, * __start to address 4 so that is okay). */ if (rev > OLD_FLAT_VERSION) { - unsigned long persistent = 0; - for (i=0; i < relocs; i++) { + unsigned long __maybe_unused persistent = 0; + for (i = 0; i < relocs; i++) { unsigned long addr, relval; - /* Get the address of the pointer to be - relocated (of course, the address has to be - relocated first). */ - relval = ntohl(reloc[i]); - if (flat_set_persistent (relval, &persistent)) + /* + * Get the address of the pointer to be + * relocated (of course, the address has to be + * relocated first). 
+ */ + if (get_user(relval, reloc + i)) + return -EFAULT; + relval = ntohl(relval); + if (flat_set_persistent(relval, &persistent)) continue; addr = flat_get_relocate_addr(relval); - rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1); - if (rp == (unsigned long *)RELOC_FAILED) { + rp = (unsigned long __user *)calc_reloc(addr, libinfo, id, 1); + if (rp == (unsigned long __user *)RELOC_FAILED) { ret = -ENOEXEC; goto err; } @@ -780,17 +825,23 @@ static int load_flat_file(struct linux_binprm * bprm, } } } else { - for (i=0; i < relocs; i++) - old_reloc(ntohl(reloc[i])); + for (i = 0; i < relocs; i++) { + unsigned long relval; + if (get_user(relval, reloc + i)) + return -EFAULT; + relval = ntohl(relval); + old_reloc(relval); + } } - + flush_icache_range(start_code, end_code); /* zero the BSS, BRK and stack areas */ - memset((void*)(datapos + data_len), 0, bss_len + - (memp + memp_size - stack_len - /* end brk */ - libinfo->lib_list[id].start_brk) + /* start brk */ - stack_len); + if (clear_user((void __user *)(datapos + data_len), bss_len + + (memp + memp_size - stack_len - /* end brk */ + libinfo->lib_list[id].start_brk) + /* start brk */ + stack_len)) + return -EFAULT; return 0; err: @@ -846,7 +897,7 @@ out: allow_write_access(bprm.file); fput(bprm.file); - return(res); + return res; } #endif /* CONFIG_BINFMT_SHARED_FLAT */ @@ -857,18 +908,17 @@ out: * libraries. There is no binary dependent code anywhere else. 
*/ -static int load_flat_binary(struct linux_binprm * bprm) +static int load_flat_binary(struct linux_binprm *bprm) { struct lib_info libinfo; struct pt_regs *regs = current_pt_regs(); - unsigned long p = bprm->p; - unsigned long stack_len; + unsigned long stack_len = 0; unsigned long start_addr; - unsigned long *sp; int res; int i, j; memset(&libinfo, 0, sizeof(libinfo)); + /* * We have to add the size of our arguments to our stack size * otherwise it's too easy for users to create stack overflows @@ -876,38 +926,54 @@ static int load_flat_binary(struct linux_binprm * bprm) * pedantic and include space for the argv/envp array as it may have * a lot of entries. */ -#define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *)) - stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ - stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ - stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ - stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */ - +#ifndef CONFIG_MMU + stack_len += PAGE_SIZE * MAX_ARG_PAGES - bprm->p; /* the strings */ +#endif + stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ + stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ + stack_len = ALIGN(stack_len, FLAT_STACK_ALIGN); + res = load_flat_file(bprm, &libinfo, 0, &stack_len); if (res < 0) return res; - + /* Update data segment pointers for all libraries */ - for (i=0; i<MAX_SHARED_LIBS; i++) - if (libinfo.lib_list[i].loaded) - for (j=0; j<MAX_SHARED_LIBS; j++) - (-(j+1))[(unsigned long *)(libinfo.lib_list[i].start_data)] = - (libinfo.lib_list[j].loaded)? - libinfo.lib_list[j].start_data:UNLOADED_LIB; + for (i = 0; i < MAX_SHARED_LIBS; i++) { + if (!libinfo.lib_list[i].loaded) + continue; + for (j = 0; j < MAX_SHARED_LIBS; j++) { + unsigned long val = libinfo.lib_list[j].loaded ? 
+ libinfo.lib_list[j].start_data : UNLOADED_LIB; + unsigned long __user *p = (unsigned long __user *) + libinfo.lib_list[i].start_data; + p -= j + 1; + if (put_user(val, p)) + return -EFAULT; + } + } install_exec_creds(bprm); set_binfmt(&flat_format); - p = ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4; - DBG_FLT("p=%x\n", (int)p); +#ifdef CONFIG_MMU + res = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + if (!res) + res = create_flat_tables(bprm, bprm->p); +#else + /* Stash our initial stack pointer into the mm structure */ + current->mm->start_stack = + ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4; + pr_debug("sp=%lx\n", current->mm->start_stack); - /* copy the arg pages onto the stack, this could be more efficient :-) */ - for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--) - * (char *) --p = - ((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE]; + /* copy the arg pages onto the stack */ + res = transfer_args_to_stack(bprm, ¤t->mm->start_stack); + if (!res) + res = create_flat_tables(bprm, current->mm->start_stack); +#endif + if (res) + return res; - sp = (unsigned long *) create_flat_tables(p, bprm); - /* Fake some return addresses to ensure the call chain will * initialise library in order for us. We are required to call * lib 1 first, then 2, ... and finally the main program (id 0). 
@@ -915,24 +981,24 @@ static int load_flat_binary(struct linux_binprm * bprm) start_addr = libinfo.lib_list[0].entry; #ifdef CONFIG_BINFMT_SHARED_FLAT - for (i = MAX_SHARED_LIBS-1; i>0; i--) { + for (i = MAX_SHARED_LIBS-1; i > 0; i--) { if (libinfo.lib_list[i].loaded) { /* Push previos first to call address */ - --sp; put_user(start_addr, sp); + unsigned long __user *sp; + current->mm->start_stack -= sizeof(unsigned long); + sp = (unsigned long __user *)current->mm->start_stack; + __put_user(start_addr, sp); start_addr = libinfo.lib_list[i].entry; } } #endif - - /* Stash our initial stack pointer into the mm structure */ - current->mm->start_stack = (unsigned long )sp; #ifdef FLAT_PLAT_INIT FLAT_PLAT_INIT(regs); #endif - DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", - (int)regs, (int)start_addr, (int)current->mm->start_stack); - + + pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n", + regs, start_addr, current->mm->start_stack); start_thread(regs, start_addr, current->mm->start_stack); return 0; @@ -945,9 +1011,6 @@ static int __init init_flat_binfmt(void) register_binfmt(&flat_format); return 0; } - -/****************************************************************************/ - core_initcall(init_flat_binfmt); /****************************************************************************/ diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 67a607709d4f..53bb7af4e5f0 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -55,8 +55,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) } if (size > 0) { acl = posix_acl_from_xattr(&init_user_ns, value, size); - } else if (size == -ENOENT || size == -ENODATA || size == 0) { - /* FIXME, who returns -ENOENT? 
I think nobody */ + } else if (size == -ERANGE || size == -ENODATA || size == 0) { acl = NULL; } else { acl = ERR_PTR(-EIO); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5fb60ea7eee2..e0f071f6b5a7 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -34,6 +34,10 @@ struct __btrfs_workqueue { struct workqueue_struct *normal_wq; + + /* File system this workqueue services */ + struct btrfs_fs_info *fs_info; + /* List head pointing to ordered work list */ struct list_head ordered_list; @@ -70,6 +74,18 @@ void btrfs_##name(struct work_struct *arg) \ normal_work_helper(work); \ } +struct btrfs_fs_info * +btrfs_workqueue_owner(struct __btrfs_workqueue *wq) +{ + return wq->fs_info; +} + +struct btrfs_fs_info * +btrfs_work_owner(struct btrfs_work *work) +{ + return work->wq->fs_info; +} + BTRFS_WORK_HELPER(worker_helper); BTRFS_WORK_HELPER(delalloc_helper); BTRFS_WORK_HELPER(flush_delalloc_helper); @@ -94,14 +110,15 @@ BTRFS_WORK_HELPER(scrubnc_helper); BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, - int thresh) +__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags, int limit_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; + ret->fs_info = fs_info; ret->limit_active = limit_active; atomic_set(&ret->pending, 0); if (thresh == 0) @@ -143,7 +160,8 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, static inline void __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); -struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, int limit_active, int thresh) @@ -153,7 +171,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, if (!ret) return NULL; - 
ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, + ret->normal = __btrfs_alloc_workqueue(fs_info, name, + flags & ~WQ_HIGHPRI, limit_active, thresh); if (!ret->normal) { kfree(ret); @@ -161,8 +180,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, } if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(name, flags, limit_active, - thresh); + ret->high = __btrfs_alloc_workqueue(fs_info, name, flags, + limit_active, thresh); if (!ret->high) { __btrfs_destroy_workqueue(ret->normal); kfree(ret); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ad4d0647d1a6..8e52484cd461 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -21,6 +21,7 @@ #define __BTRFS_ASYNC_THREAD_ #include <linux/workqueue.h> +struct btrfs_fs_info; struct btrfs_workqueue; /* Internal use only */ struct __btrfs_workqueue; @@ -67,7 +68,8 @@ BTRFS_WORK_HELPER_PROTO(scrubnc_helper); BTRFS_WORK_HELPER_PROTO(scrubparity_helper); -struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, int limit_active, int thresh); @@ -80,4 +82,6 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); +struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work); +struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8bb3509099e8..2b88439c2ee8 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -139,7 +139,7 @@ int __init btrfs_prelim_ref_init(void) btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", sizeof(struct __prelim_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_prelim_ref_cache) return 
-ENOMEM; @@ -361,7 +361,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (btrfs_test_is_dummy_root(root)) { + if (btrfs_is_testing(fs_info)) { srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -ENOENT; goto out; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index cefedabf0a92..029db6e1105c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -403,7 +403,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, } ret = btrfs_map_bio(root, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } bio_put(bio); @@ -434,7 +437,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, } ret = btrfs_map_bio(root, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } bio_put(bio); return 0; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a85cf7d23309..d1c56c94dd5a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1153,14 +1153,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -1198,7 +1198,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(root->fs_info, buf); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -1505,7 +1505,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; /* 
ensure we can see the force_cow */ @@ -1771,6 +1771,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb, unsigned long map_len = 0; int err; + if (low > high) { + btrfs_err(eb->fs_info, + "%s: low (%d) > high (%d) eb %llu owner %llu level %d", + __func__, low, high, eb->start, + btrfs_header_owner(eb), btrfs_header_level(eb)); + return -EINVAL; + } + while (low < high) { mid = (low + high) / 2; offset = p + mid * item_size; @@ -1858,7 +1866,6 @@ static void root_sub_used(struct btrfs_root *root, u32 size) /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). - * NULL is returned on error. */ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, struct extent_buffer *parent, int slot) @@ -1866,19 +1873,16 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, int level = btrfs_header_level(parent); struct extent_buffer *eb; - if (slot < 0) - return NULL; - if (slot >= btrfs_header_nritems(parent)) - return NULL; + if (slot < 0 || slot >= btrfs_header_nritems(parent)) + return ERR_PTR(-ENOENT); BUG_ON(level == 0); eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); - if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { - if (!IS_ERR(eb)) - free_extent_buffer(eb); - eb = NULL; + if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + eb = ERR_PTR(-EIO); } return eb; @@ -1931,8 +1935,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - if (!child) { - ret = -EROFS; + if (IS_ERR(child)) { + ret = PTR_ERR(child); btrfs_handle_fs_error(root->fs_info, ret, NULL); goto enospc; } @@ -1970,6 +1974,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; left = read_node_slot(root, parent, pslot - 1); + if (IS_ERR(left)) + left = NULL; + 
if (left) { btrfs_tree_lock(left); btrfs_set_lock_blocking(left); @@ -1980,7 +1987,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } } + right = read_node_slot(root, parent, pslot + 1); + if (IS_ERR(right)) + right = NULL; + if (right) { btrfs_tree_lock(right); btrfs_set_lock_blocking(right); @@ -2135,6 +2146,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, return 1; left = read_node_slot(root, parent, pslot - 1); + if (IS_ERR(left)) + left = NULL; /* first, try to make some room in the middle buffer */ if (left) { @@ -2185,6 +2198,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, free_extent_buffer(left); } right = read_node_slot(root, parent, pslot + 1); + if (IS_ERR(right)) + right = NULL; /* * then try to empty the right most buffer into the middle @@ -3240,7 +3255,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, push_items); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(dst, src, @@ -3315,7 +3330,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, src_nritems - push_items, push_items); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(dst, src, @@ -3519,7 +3534,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(split, c, @@ -3773,7 +3788,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); - if (right == NULL) 
+ /* + * slot + 1 is not valid or we fail to read the right node, + * no big deal, just return. + */ + if (IS_ERR(right)) return 1; btrfs_tree_lock(right); @@ -4003,7 +4022,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); - if (left == NULL) + /* + * slot - 1 is not valid or we fail to read the left node, + * no big deal, just return. + */ + if (IS_ERR(left)) return 1; btrfs_tree_lock(left); @@ -5210,7 +5233,10 @@ find_next_key: } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); - BUG_ON(!cur); /* -ENOMEM */ + if (IS_ERR(cur)) { + ret = PTR_ERR(cur); + goto out; + } btrfs_tree_read_lock(cur); @@ -5229,15 +5255,21 @@ out: return ret; } -static void tree_move_down(struct btrfs_root *root, +static int tree_move_down(struct btrfs_root *root, struct btrfs_path *path, int *level, int root_level) { + struct extent_buffer *eb; + BUG_ON(*level == 0); - path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], - path->slots[*level]); + eb = read_node_slot(root, path->nodes[*level], path->slots[*level]); + if (IS_ERR(eb)) + return PTR_ERR(eb); + + path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; + return 0; } static int tree_move_next_or_upnext(struct btrfs_root *root, @@ -5282,8 +5314,7 @@ static int tree_advance(struct btrfs_root *root, if (*level == 0 || !allow_down) { ret = tree_move_next_or_upnext(root, path, level, root_level); } else { - tree_move_down(root, path, level, root_level); - ret = 0; + ret = tree_move_down(root, path, level, root_level); } if (ret >= 0) { if (*level == 0) @@ -5457,8 +5488,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, left_root_level, advance_left != ADVANCE_ONLY_NEXT, &left_key); - if (ret < 0) + if (ret == -1) left_end_reached = ADVANCE; + else if (ret < 0) + goto out; advance_left = 0; } if (advance_right && !right_end_reached) { @@ -5466,8 
+5499,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_root_level, advance_right != ADVANCE_ONLY_NEXT, &right_key); - if (ret < 0) + if (ret == -1) right_end_reached = ADVANCE; + else if (ret < 0) + goto out; advance_right = 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 443fcc402114..2fe8f89091a3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -117,6 +117,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_REMOUNTING 1 #define BTRFS_FS_STATE_TRANS_ABORTED 2 #define BTRFS_FS_STATE_DEV_REPLACING 3 +#define BTRFS_FS_STATE_DUMMY_FS_INFO 4 #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 @@ -144,21 +145,6 @@ struct btrfs_header { u8 level; } __attribute__ ((__packed__)); -#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ - sizeof(struct btrfs_header)) / \ - sizeof(struct btrfs_key_ptr)) -#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) -#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize)) -#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ - (offsetof(struct btrfs_file_extent_item, disk_bytenr)) -#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) - \ - BTRFS_FILE_EXTENT_INLINE_DATA_START) -#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) -\ - sizeof(struct btrfs_dir_item)) - - /* * this is a very generous portion of the super block, giving us * room to translate 14 chunks with 3 stripes each. 
@@ -1114,12 +1100,11 @@ struct btrfs_subvolume_writers { #define BTRFS_ROOT_REF_COWS 1 #define BTRFS_ROOT_TRACK_DIRTY 2 #define BTRFS_ROOT_IN_RADIX 3 -#define BTRFS_ROOT_DUMMY_ROOT 4 -#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5 -#define BTRFS_ROOT_DEFRAG_RUNNING 6 -#define BTRFS_ROOT_FORCE_COW 7 -#define BTRFS_ROOT_MULTI_LOG_TASKS 8 -#define BTRFS_ROOT_DIRTY 9 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4 +#define BTRFS_ROOT_DEFRAG_RUNNING 5 +#define BTRFS_ROOT_FORCE_COW 6 +#define BTRFS_ROOT_MULTI_LOG_TASKS 7 +#define BTRFS_ROOT_DIRTY 8 /* * in ram representation of the tree. extent_root is used for all allocations @@ -1181,8 +1166,10 @@ struct btrfs_root { u64 highest_objectid; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ u64 alloc_bytenr; +#endif u64 defrag_trans_start; struct btrfs_key defrag_progress; @@ -1259,6 +1246,39 @@ struct btrfs_root { atomic_t qgroup_meta_rsv; }; +static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize) +{ + return blocksize - sizeof(struct btrfs_header); +} + +static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root) +{ + return __BTRFS_LEAF_DATA_SIZE(root->nodesize); +} + +static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root) +{ + return BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); +} + +static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root) +{ + return BTRFS_LEAF_DATA_SIZE(root) / sizeof(struct btrfs_key_ptr); +} + +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root) +{ + return BTRFS_MAX_ITEM_SIZE(root) - + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root) +{ + return BTRFS_MAX_ITEM_SIZE(root) - sizeof(struct btrfs_dir_item); +} + /* * Flags for mount options. 
* @@ -1299,21 +1319,21 @@ struct btrfs_root { #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) -#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ +#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -#define btrfs_set_and_info(root, opt, fmt, args...) \ +#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ { \ - if (!btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_set_opt(root->fs_info->mount_opt, opt); \ + if (!btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_set_opt(fs_info->mount_opt, opt); \ } -#define btrfs_clear_and_info(root, opt, fmt, args...) \ +#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ { \ - if (btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_clear_opt(root->fs_info->mount_opt, opt); \ + if (btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_clear_opt(fs_info->mount_opt, opt); \ } #ifdef CONFIG_BTRFS_DEBUG @@ -1321,9 +1341,9 @@ static inline int btrfs_should_fragment_free_space(struct btrfs_root *root, struct btrfs_block_group_cache *block_group) { - return (btrfs_test_opt(root, FRAGMENT_METADATA) && + return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) && block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(root, FRAGMENT_DATA) && + (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) && block_group->flags & BTRFS_BLOCK_GROUP_DATA); } #endif @@ -2886,9 +2906,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, u64 root_id, u64 ref_id, u64 
dirid, u64 sequence, @@ -3362,23 +3379,23 @@ const char *btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, + const char *function, unsigned int line, int errno); /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. */ -#define btrfs_abort_transaction(trans, root, errno) \ +#define btrfs_abort_transaction(trans, errno) \ do { \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((root)->fs_info->fs_state))) { \ + &((trans)->fs_info->fs_state))) { \ WARN(1, KERN_DEBUG \ "BTRFS: Transaction aborted (error %d)\n", \ (errno)); \ } \ - __btrfs_abort_transaction((trans), (root), __func__, \ + __btrfs_abort_transaction((trans), __func__, \ __LINE__, (errno)); \ } while (0) @@ -3610,13 +3627,13 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) void btrfs_test_destroy_inode(struct inode *inode); #endif -static inline int btrfs_test_is_dummy_root(struct btrfs_root *root) +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, + &fs_info->fs_state))) return 1; #endif return 0; } - #endif diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h new file mode 100644 index 000000000000..83ebfe28da9e --- /dev/null +++ b/fs/btrfs/dedupe.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2016 Fujitsu. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_DEDUPE__ +#define __BTRFS_DEDUPE__ + +/* later in-band dedupe will expand this struct */ +struct btrfs_dedupe_hash; +#endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index dd3c040139a2..3eeb9cd8cfa5 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -34,7 +34,7 @@ int __init btrfs_delayed_inode_init(void) delayed_node_cache = kmem_cache_create("btrfs_delayed_node", sizeof(struct btrfs_delayed_node), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!delayed_node_cache) return -ENOMEM; @@ -1170,7 +1170,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, if (ret) { btrfs_release_delayed_node(curr_node); curr_node = NULL; - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 430b3689b112..b6d210e7a993 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -606,7 +606,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; - qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, + qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, + delayed_refs, qrecord); if (qexisting) kfree(qrecord); @@ -615,7 +616,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); - trace_add_delayed_ref_head(ref, head_ref, action); + 
trace_add_delayed_ref_head(fs_info, ref, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); @@ -682,7 +683,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->type = BTRFS_TREE_BLOCK_REF_KEY; full_ref->level = level; - trace_add_delayed_tree_ref(ref, full_ref, action); + trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); @@ -739,7 +740,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, full_ref->objectid = owner; full_ref->offset = offset; - trace_add_delayed_data_ref(ref, full_ref, action); + trace_add_delayed_data_ref(fs_info, ref, full_ref, action); ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); @@ -940,28 +941,28 @@ int btrfs_delayed_ref_init(void) btrfs_delayed_ref_head_cachep = kmem_cache_create( "btrfs_delayed_ref_head", sizeof(struct btrfs_delayed_ref_head), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_ref_head_cachep) goto fail; btrfs_delayed_tree_ref_cachep = kmem_cache_create( "btrfs_delayed_tree_ref", sizeof(struct btrfs_delayed_tree_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_tree_ref_cachep) goto fail; btrfs_delayed_data_ref_cachep = kmem_cache_create( "btrfs_delayed_data_ref", sizeof(struct btrfs_delayed_data_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_data_ref_cachep) goto fail; btrfs_delayed_extent_op_cachep = kmem_cache_create( "btrfs_delayed_extent_op", sizeof(struct btrfs_delayed_extent_op), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_extent_op_cachep) goto fail; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 63ef9cdf0144..e9bbff3c0029 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -142,7 +142,7 @@ 
no_valid_dev_replace_entry_found: * missing */ if (!dev_replace->srcdev && - !btrfs_test_opt(dev_root, DEGRADED)) { + !btrfs_test_opt(dev_root->fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -151,7 +151,7 @@ no_valid_dev_replace_entry_found: src_devid); } if (!dev_replace->tgtdev && - !btrfs_test_opt(dev_root, DEGRADED)) { + !btrfs_test_opt(dev_root->fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9a726ded2c6d..87dad552e39a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -101,7 +101,7 @@ int __init btrfs_end_io_wq_init(void) btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", sizeof(struct btrfs_end_io_wq), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_end_io_wq_cache) return -ENOMEM; @@ -1140,7 +1140,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr) { - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return alloc_test_extent_buffer(root->fs_info, bytenr, root->nodesize); return alloc_extent_buffer(root->fs_info, bytenr); @@ -1227,6 +1227,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { + bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); root->node = NULL; root->commit_root = NULL; root->sectorsize = sectorsize; @@ -1281,14 +1282,14 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; - if (fs_info) + if (!dummy) extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping); memset(&root->root_key, 0, sizeof(root->root_key)); 
memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - if (fs_info) + if (!dummy) root->defrag_trans_start = fs_info->generation; else root->defrag_trans_start = 0; @@ -1309,17 +1310,20 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ -struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize) +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, + u32 sectorsize, u32 nodesize) { struct btrfs_root *root; - root = btrfs_alloc_root(NULL, GFP_KERNEL); + if (!fs_info) + return ERR_PTR(-EINVAL); + + root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); /* We don't use the stripesize in selftest, set it as sectorsize */ - __setup_root(nodesize, sectorsize, sectorsize, root, NULL, + __setup_root(nodesize, sectorsize, sectorsize, root, fs_info, BTRFS_ROOT_TREE_OBJECTID); - set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); root->alloc_bytenr = 0; return root; @@ -1594,14 +1598,14 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) - goto free_writers; + goto fail; mutex_lock(&root->objectid_mutex); ret = btrfs_find_highest_objectid(root, &root->highest_objectid); if (ret) { mutex_unlock(&root->objectid_mutex); - goto free_root_dev; + goto fail; } ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1609,14 +1613,8 @@ int btrfs_init_fs_root(struct btrfs_root *root) mutex_unlock(&root->objectid_mutex); return 0; - -free_root_dev: - free_anon_bdev(root->anon_dev); -free_writers: - btrfs_free_subvolume_writers(root->subv_writers); fail: - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); + /* the caller is responsible to call free_fs_root */ return ret; } @@ -2310,17 +2308,19 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, unsigned int flags = 
WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = - btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, - max_active, 16); + btrfs_alloc_workqueue(fs_info, "worker", + flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = - btrfs_alloc_workqueue("delalloc", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "delalloc", + flags, max_active, 2); fs_info->flush_workers = - btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "flush_delalloc", + flags, max_active, 0); fs_info->caching_workers = - btrfs_alloc_workqueue("cache", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); /* * a higher idle thresh on the submit workers makes it much more @@ -2328,41 +2328,48 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, * devices */ fs_info->submit_workers = - btrfs_alloc_workqueue("submit", flags, + btrfs_alloc_workqueue(fs_info, "submit", flags, min_t(u64, fs_devices->num_devices, max_active), 64); fs_info->fixup_workers = - btrfs_alloc_workqueue("fixup", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); /* * endios are largely parallel and should have a very * low idle thresh */ fs_info->endio_workers = - btrfs_alloc_workqueue("endio", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); fs_info->endio_meta_workers = - btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-meta", flags, + max_active, 4); fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, + max_active, 2); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, + max_active, 4); fs_info->endio_repair_workers = - btrfs_alloc_workqueue("endio-repair", flags, 1, 0); + 
btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0); fs_info->rmw_workers = - btrfs_alloc_workqueue("rmw", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); fs_info->endio_write_workers = - btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-write", flags, + max_active, 2); fs_info->endio_freespace_worker = - btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "freespace-write", flags, + max_active, 0); fs_info->delayed_workers = - btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, + max_active, 0); fs_info->readahead_workers = - btrfs_alloc_workqueue("readahead", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "readahead", flags, + max_active, 2); fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); fs_info->extent_workers = - btrfs_alloc_workqueue("extent-refs", flags, + btrfs_alloc_workqueue(fs_info, "extent-refs", flags, min_t(u64, fs_devices->num_devices, max_active), 8); @@ -3010,8 +3017,8 @@ retry_root_backup: if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; - if (!btrfs_test_opt(tree_root, SSD) && - !btrfs_test_opt(tree_root, NOSSD) && + if (!btrfs_test_opt(tree_root->fs_info, SSD) && + !btrfs_test_opt(tree_root->fs_info, NOSSD) && !fs_info->fs_devices->rotating) { btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); btrfs_set_opt(fs_info->mount_opt, SSD); @@ -3024,9 +3031,9 @@ retry_root_backup: btrfs_apply_pending_changes(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + if (btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(tree_root, fs_devices, - btrfs_test_opt(tree_root, + btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? 
1 : 0, fs_info->check_integrity_print_mask); @@ -3042,7 +3049,7 @@ retry_root_backup: /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && - !btrfs_test_opt(tree_root, NOLOGREPLAY)) { + !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) { ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; @@ -3083,7 +3090,7 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; - if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && + if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "creating free space tree"); ret = btrfs_create_free_space_tree(fs_info); @@ -3120,7 +3127,7 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (btrfs_test_opt(tree_root, CLEAR_CACHE) && + if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "clearing free space tree"); ret = btrfs_clear_free_space_tree(fs_info); @@ -3141,7 +3148,7 @@ retry_root_backup: close_ctree(tree_root); return ret; } - } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || + } else if (btrfs_test_opt(tree_root->fs_info, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super)) { btrfs_info(fs_info, "checking UUID tree"); @@ -3218,7 +3225,7 @@ fail: return err; recovery_tree_root: - if (!btrfs_test_opt(tree_root, USEBACKUPROOT)) + if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT)) goto fail_tree_roots; free_root_pointers(fs_info, 0); @@ -3634,7 +3641,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - do_barriers = !btrfs_test_opt(root, NOBARRIER); + do_barriers = !btrfs_test_opt(root->fs_info, NOBARRIER); backup_super_roots(root->fs_info); sb = root->fs_info->super_for_commit; @@ -3918,7 +3925,7 @@ void close_ctree(struct btrfs_root *root) iput(fs_info->btree_inode); #ifdef 
CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY)) + if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY)) btrfsic_unmount(root, fs_info->fs_devices); #endif diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index dbf3e1aab69e..b3207a0e09f7 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -90,7 +90,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, void btrfs_free_fs_root(struct btrfs_root *root); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize); +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, + u32 sectorsize, u32 nodesize); #endif /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e9376b1657e2..61b494e8e604 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2180,7 +2180,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; @@ -2204,7 +2204,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(node, ref, node->action); + trace_run_delayed_data_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; @@ -2359,7 +2359,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(node, ref, node->action); + trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; @@ -2423,7 +2423,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, */ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); - 
trace_run_delayed_ref_head(node, head, node->action); + trace_run_delayed_ref_head(root->fs_info, node, head, + node->action); if (insert_reserved) { btrfs_pin_extent(root, node->bytenr, @@ -2778,7 +2779,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) u64 num_csums_per_leaf; u64 num_csums; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + csum_size = BTRFS_MAX_ITEM_SIZE(root); num_csums_per_leaf = div64_u64(csum_size, (u64)btrfs_super_csum_size(root->fs_info->super_copy)); num_csums = div64_u64(csum_bytes, root->sectorsize); @@ -2970,7 +2971,7 @@ again: trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, root, count); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -3234,7 +3235,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, u64, u64, u64, u64, u64, u64); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; ref_root = btrfs_header_owner(buf); @@ -3429,7 +3430,7 @@ again: * transaction, this only happens in really bad situations * anyway. 
*/ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } WARN_ON(ret); @@ -3447,7 +3448,7 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE)) { + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3524,7 +3525,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, struct btrfs_path *path; if (list_empty(&cur_trans->dirty_bgs) || - !btrfs_test_opt(root, SPACE_CACHE)) + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) return 0; path = btrfs_alloc_path(); @@ -3669,7 +3670,7 @@ again: } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } } @@ -3815,7 +3816,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache); } if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } /* if its not on the io list, we need to put the block group */ @@ -4443,7 +4444,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + btrfs_calc_trans_metadata_size(root, 1); - if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); dump_space_info(info, 0, 0); @@ -4588,7 +4589,7 @@ out: */ if (trans->can_flush_pending_bgs && trans->chunk_bytes_reserved >= (u64)SZ_2M) { - btrfs_create_pending_block_groups(trans, trans->root); + btrfs_create_pending_block_groups(trans, extent_root); btrfs_trans_release_chunk_metadata(trans); } return ret; @@ -5729,7 +5730,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, */ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) { - struct btrfs_fs_info 
*fs_info = trans->root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->chunk_bytes_reserved) return; @@ -6100,7 +6101,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; trace_btrfs_space_reservation(root->fs_info, "delalloc", @@ -6215,7 +6216,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -6597,7 +6598,7 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 *empty_cluster) { struct btrfs_free_cluster *ret = NULL; - bool ssd = btrfs_test_opt(root, SSD); + bool ssd = btrfs_test_opt(root->fs_info, SSD); *empty_cluster = 0; if (btrfs_mixed_space_info(space_info)) @@ -6742,7 +6743,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, break; } - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, end + 1 - start, NULL); @@ -6880,7 +6881,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, NULL, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -6929,7 +6930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } extent_slot = path->slots[0]; @@ -6940,10 +6941,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu 
offset %llu", bytenr, parent, root_objectid, owner_objectid, owner_offset); - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } else { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6955,7 +6956,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6974,7 +6975,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6999,7 +7000,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_err(info, "trying to drop %d refs but we only have %Lu " "for bytenr %Lu", refs_to_drop, refs, bytenr); ret = -EINVAL; - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } refs -= refs_to_drop; @@ -7022,7 +7023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, iref, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -7045,7 +7046,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -7053,7 +7054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -7061,13 
+7062,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = add_to_free_space_tree(trans, root->fs_info, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -7216,7 +7217,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(fs_info)) return 0; add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); @@ -7851,8 +7852,7 @@ loop: * can do more things. */ if (ret < 0 && ret != -ENOSPC) - btrfs_abort_transaction(trans, - root, ret); + btrfs_abort_transaction(trans, ret); else ret = 0; if (!exist) @@ -7906,8 +7906,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", info->flags, info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly, - (info->full) ? "" : "not "); + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use, (info->full) ? 
"" : "not "); printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", info->total_bytes, info->bytes_used, info->bytes_pinned, @@ -7961,7 +7961,7 @@ again: if (num_bytes == min_alloc_size) final_tried = true; goto again; - } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, flags); @@ -7992,7 +7992,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, if (pin) pin_down_extent(root, cache, start, len, 1); else { - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); @@ -8300,7 +8300,7 @@ again: goto again; } - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8354,13 +8354,15 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (btrfs_test_is_dummy_root(root)) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (btrfs_is_testing(root->fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } +#endif block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) @@ -8540,7 +8542,8 @@ static int record_one_subtree_extent(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord)) + if (btrfs_qgroup_insert_dirty_extent(trans->fs_info, + delayed_refs, qrecord)) kfree(qrecord); spin_unlock(&delayed_refs->lock); @@ 
-9325,7 +9328,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, &root->root_key, root_item); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -9352,7 +9355,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_del_root(trans, tree_root, &root->root_key); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -9360,7 +9363,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } else if (ret > 0) { @@ -9731,7 +9734,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) int full = 0; int ret = 0; - debug = btrfs_test_opt(root, ENOSPC_DEBUG); + debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG); block_group = btrfs_lookup_block_group(root->fs_info, bytenr); @@ -9887,7 +9890,22 @@ static int find_first_block_group(struct btrfs_root *root, if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = 0; + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, found_key.objectid, + found_key.offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(root->fs_info, + "logical %llu len %llu found bg but no related chunk", + found_key.objectid, found_key.offset); + ret = -ENOENT; + } else { + ret = 0; + } goto out; } path->slots[0]++; @@ -10129,10 +10147,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && 
btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(root->fs_info, CLEAR_CACHE)) need_clear = 1; while (1) { @@ -10163,7 +10181,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) cache->disk_cache_state = BTRFS_DC_CLEAR; } @@ -10305,11 +10323,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, extent_root, &key, &item, sizeof(item)); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); ret = btrfs_finish_chunk_alloc(trans, extent_root, key.objectid, key.offset); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, root->fs_info, block_group); /* already aborted the transaction if it failed. */ next: @@ -10622,7 +10640,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { WARN_ON(block_group->space_info->total_bytes < block_group->key.offset); WARN_ON(block_group->space_info->bytes_readonly @@ -10890,7 +10908,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&space_info->lock); /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(root, DISCARD); + trimming = btrfs_test_opt(root->fs_info, DISCARD); /* Implicit trim during transaction commit. 
*/ if (trimming) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 881eb4667051..44fe66b53c8b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -163,13 +163,13 @@ int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; @@ -2750,7 +2750,6 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, if (tree->ops && tree->ops->merge_bio_hook) ret = tree->ops->merge_bio_hook(page, offset, size, bio, bio_flags); - BUG_ON(ret < 0); return ret; } @@ -2873,6 +2872,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * into the tree that are removed when the IO is done (by the end_io * handlers) * XXX JDM: This needs looking at to ensure proper page locking + * return 0 on success, otherwise return error */ static int __do_readpage(struct extent_io_tree *tree, struct page *page, @@ -2894,7 +2894,7 @@ static int __do_readpage(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; - int ret; + int ret = 0; int nr = 0; size_t pg_offset = 0; size_t iosize; @@ -3075,6 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree, } else { SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + goto out; } cur = cur + iosize; pg_offset += iosize; @@ -3085,7 +3086,7 @@ out: SetPageUptodate(page); unlock_page(page); } - return 0; + return ret; } static inline void __do_contiguous_readpages(struct extent_io_tree *tree, @@ -5224,14 +5225,31 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < 
num_pages; i++) { page = eb->pages[i]; + if (!PageUptodate(page)) { + if (ret) { + atomic_dec(&eb->io_pages); + unlock_page(page); + continue; + } + ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags, REQ_META); - if (err) + if (err) { ret = err; + /* + * We use &bio in above __extent_read_full_page, + * so we ensure that if it returns error, the + * current page fails to add itself to bio and + * it's been unlocked. + * + * We must dec io_pages by ourselves. + */ + atomic_dec(&eb->io_pages); + } } else { unlock_page(page); } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index e0715fcfb11e..26f9ac719d20 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -13,7 +13,7 @@ int __init extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 62a81ee13a5f..d0d571c47d33 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -250,7 +250,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, offset + root->sectorsize - 1, EXTENT_NODATASUM); } else { - btrfs_info(BTRFS_I(inode)->root->fs_info, + btrfs_info_rl(BTRFS_I(inode)->root->fs_info, "no csum found for inode %llu start %llu", btrfs_ino(inode), offset); } @@ -699,7 +699,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, */ ret = btrfs_split_item(trans, root, path, &key, offset); if (ret && ret != -EAGAIN) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bcfb4a27ddd4..9404121fd5f7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -132,7 +132,7 @@ static int __btrfs_add_inode_defrag(struct inode *inode, static inline int __need_auto_defrag(struct btrfs_root *root) { - if 
(!btrfs_test_opt(root, AUTO_DEFRAG)) + if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG)) return 0; if (btrfs_fs_closing(root->fs_info)) @@ -950,7 +950,7 @@ delete_extent_item: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -974,7 +974,7 @@ delete_extent_item: path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } leaf = path->nodes[0]; @@ -1190,7 +1190,7 @@ again: goto again; } if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -1278,7 +1278,7 @@ again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -2975,7 +2975,7 @@ int btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", sizeof(struct inode_defrag), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 69d270f6602c..d571bd2b697b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -280,7 +280,7 @@ fail: if (locked) mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -3026,7 +3026,7 @@ int btrfs_find_space_cluster(struct btrfs_root *root, * For metadata, allow allocates with smaller extents. For * data, keep it dense. 
*/ - if (btrfs_test_opt(root, SSD_SPREAD)) { + if (btrfs_test_opt(root->fs_info, SSD_SPREAD)) { cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; @@ -3470,7 +3470,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) int ret = 0; u64 root_gen = btrfs_root_generation(&root->root_item); - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; /* @@ -3514,7 +3514,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_io_ctl io_ctl; bool release_metadata = true; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; memset(&io_ctl, 0, sizeof(io_ctl)); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 53dbeaf6ce94..87e7e3d3e676 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -305,7 +305,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, out: kvfree(bitmap); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -454,7 +454,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, out: kvfree(bitmap); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -851,7 +851,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1047,7 +1047,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1193,7 +1193,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) abort: 
fs_info->creating_free_space_tree = 0; - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -1280,7 +1280,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) return 0; abort: - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -1333,7 +1333,7 @@ out: btrfs_free_path(path); mutex_unlock(&block_group->free_space_lock); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1410,7 +1410,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 70107f7c9307..aa6fabaee72e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -38,7 +38,7 @@ static int caching_kthread(void *data) int slot; int ret; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -141,7 +141,7 @@ static void start_caching(struct btrfs_root *root) int ret; u64 objectid; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; spin_lock(&root->ino_cache_lock); @@ -185,7 +185,7 @@ static void start_caching(struct btrfs_root *root) int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) { - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return btrfs_find_free_objectid(root, objectid); again: @@ -211,7 +211,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, 
INODE_MAP_CACHE)) return; again: if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { @@ -251,7 +251,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) struct rb_node *n; u64 count; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; while (1) { @@ -412,7 +412,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (btrfs_root_refs(&root->root_item) == 0) return 0; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -458,7 +458,7 @@ again: BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } @@ -466,7 +466,7 @@ again: ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) { if (ret != -ENOSPC) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8078077d1090..b0f421f332ae 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -60,6 +60,7 @@ #include "hash.h" #include "props.h" #include "qgroup.h" +#include "dedupe.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -105,8 +106,9 @@ static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); + u64 start, u64 end, u64 delalloc_end, + int *page_started, unsigned long *nr_written, + int unlock, struct btrfs_dedupe_hash *hash); static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, @@ -294,7 +296,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, start, aligned_end, NULL, 1, 1, 
extent_item_size, &extent_inserted); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -305,7 +307,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; @@ -374,12 +376,12 @@ static inline int inode_need_compress(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; /* force compress */ - if (btrfs_test_opt(root, FORCE_COMPRESS)) + if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) return 1; /* bad compression ratios */ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) return 0; - if (btrfs_test_opt(root, COMPRESS) || + if (btrfs_test_opt(root->fs_info, COMPRESS) || BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || BTRFS_I(inode)->force_compress) return 1; @@ -585,9 +587,27 @@ cont: will_compress = 0; } else { num_bytes = total_in; + *num_added += 1; + + /* + * The async work queues will take care of doing actual + * allocation on disk for these compressed pages, and + * will submit them to the elevator. 
+ */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret, + compress_type); + + if (start + num_bytes < end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + return; } } - if (!will_compress && pages) { + if (pages) { /* * the compression code ran but failed to make things smaller, * free any pages it allocated and our page pointer array @@ -602,48 +622,28 @@ cont: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(root, FORCE_COMPRESS) && + if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) && !(BTRFS_I(inode)->force_compress)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } } - if (will_compress) { - *num_added += 1; - - /* the async work queues will take care of doing actual - * allocation on disk for these compressed pages, - * and will submit them to the elevator. - */ - add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret, - compress_type); - - if (start + num_bytes < end) { - start += num_bytes; - pages = NULL; - cond_resched(); - goto again; - } - } else { cleanup_and_bail_uncompressed: - /* - * No compression, but we still need to write the pages in - * the file we've been given so far. redirty the locked - * page if it corresponds to our extent and set things up - * for the async work queue to run cow_file_range to do - * the normal delalloc dance - */ - if (page_offset(locked_page) >= start && - page_offset(locked_page) <= end) { - __set_page_dirty_nobuffers(locked_page); - /* unlocked later on in the async handlers */ - } - if (redirty) - extent_range_redirty_for_io(inode, start, end); - add_async_extent(async_cow, start, end - start + 1, - 0, NULL, 0, BTRFS_COMPRESS_NONE); - *num_added += 1; - } + /* + * No compression, but we still need to write the pages in the file + * we've been given so far. 
redirty the locked page if it corresponds + * to our extent and set things up for the async work queue to run + * cow_file_range to do the normal delalloc dance. + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + + if (redirty) + extent_range_redirty_for_io(inode, start, end); + add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); + *num_added += 1; return; @@ -712,7 +712,10 @@ retry: async_extent->start, async_extent->start + async_extent->ram_size - 1, - &page_started, &nr_written, 0); + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0, + NULL); /* JDM XXX */ @@ -925,9 +928,9 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, */ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) + u64 start, u64 end, u64 delalloc_end, + int *page_started, unsigned long *nr_written, + int unlock, struct btrfs_dedupe_hash *hash) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 alloc_hint = 0; @@ -1156,7 +1159,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->start = start; if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(root, FORCE_COMPRESS)) + !btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) cur_end = end; else cur_end = min(end, start + SZ_512K - 1); @@ -1418,7 +1421,8 @@ out_check: if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, - page_started, nr_written, 1); + end, page_started, nr_written, 1, + NULL); if (ret) { if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); @@ -1501,8 +1505,8 @@ out_check: } if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, end, - page_started, nr_written, 1); + 
ret = cow_file_range(inode, locked_page, cow_start, end, end, + page_started, nr_written, 1, NULL); if (ret) goto error; } @@ -1561,8 +1565,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_need_compress(inode)) { - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, start, end, end, + page_started, nr_written, 1, NULL); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); @@ -1740,7 +1744,7 @@ static void btrfs_set_bit_hook(struct inode *inode, } /* For sanity tests */ - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; __percpu_counter_add(&root->fs_info->delalloc_bytes, len, @@ -1799,7 +1803,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, btrfs_delalloc_release_metadata(inode, len); /* For sanity tests. */ - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID @@ -1822,6 +1826,10 @@ static void btrfs_clear_bit_hook(struct inode *inode, /* * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks + * + * return 1 if page cannot be merged to bio + * return 0 if page can be merged to bio + * return error otherwise */ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, @@ -1840,8 +1848,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, map_length = length; ret = btrfs_map_block(root->fs_info, bio_op(bio), logical, &map_length, NULL, 0); - /* Will always return 0 with map_multi == NULL */ - BUG_ON(ret < 0); + if (ret < 0) + return ret; if (map_length < length + size) return 1; return 0; @@ -2594,7 +2602,7 @@ again: ret = btrfs_insert_empty_item(trans, root, path, &key, 
sizeof(*extent)); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -2621,7 +2629,7 @@ again: backref->root_id, backref->inum, new->file_pos); /* start - extent_offset */ if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -2890,7 +2898,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2950,7 +2958,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->len, trans->transid); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_unlock; } @@ -2960,7 +2968,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_unlock; } ret = 0; @@ -3204,7 +3212,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); else clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); @@ -3295,7 +3303,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (ret != -EEXIST) { clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -3307,7 +3315,7 @@ int 
btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -4006,20 +4014,20 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_info(root->fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", name_len, name, ino, dir_ino); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } skip_backref: ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir_ino); if (ret != 0 && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } @@ -4028,7 +4036,7 @@ skip_backref: if (ret == -ENOENT) ret = 0; else if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err: btrfs_free_path(path); if (ret) @@ -4142,7 +4150,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -4152,7 +4160,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, dir_ino, &index, name, name_len); if (ret < 0) { if (ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } di = btrfs_search_dir_index_item(root, path, dir_ino, @@ -4162,7 +4170,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = -ENOENT; else ret = PTR_ERR(di); - btrfs_abort_transaction(trans, root, ret); + 
btrfs_abort_transaction(trans, ret); goto out; } @@ -4175,7 +4183,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -4184,7 +4192,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; @@ -4505,7 +4513,6 @@ search_again: pending_del_nr); if (err) { btrfs_abort_transaction(trans, - root, err); goto error; } @@ -4517,8 +4524,7 @@ search_again: item_end, new_size); if (err) { - btrfs_abort_transaction(trans, - root, err); + btrfs_abort_transaction(trans, err); goto error; } } else if (test_bit(BTRFS_ROOT_REF_COWS, @@ -4582,8 +4588,7 @@ delete: pending_del_slot, pending_del_nr); if (ret) { - btrfs_abort_transaction(trans, - root, ret); + btrfs_abort_transaction(trans, ret); goto error; } pending_del_nr = 0; @@ -4616,7 +4621,7 @@ out: ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } error: if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) @@ -4785,7 +4790,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); return ret; } @@ -4793,7 +4798,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 0, 0, len, 0, len, 0, 0, 0); if (ret) - btrfs_abort_transaction(trans, root, ret); + 
btrfs_abort_transaction(trans, ret); else btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); @@ -5020,7 +5025,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, BTRFS_I(inode)->disk_i_size); err = btrfs_orphan_del(trans, inode); if (err) - btrfs_abort_transaction(trans, root, err); + btrfs_abort_transaction(trans, err); btrfs_end_transaction(trans, root); } } @@ -5158,11 +5163,18 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; int steal_from_global = 0; - u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 min_size; int ret; trace_btrfs_inode_evict(inode); + if (!root) { + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + return; + } + + min_size = btrfs_calc_trunc_metadata_size(root, 1); + evict_inode_truncate_pages(inode); if (inode->i_nlink && @@ -6239,9 +6251,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_inherit_iflags(inode, dir); if (S_ISREG(mode)) { - if (btrfs_test_opt(root, NODATASUM)) + if (btrfs_test_opt(root->fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(root->fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; } @@ -6319,7 +6331,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -6330,7 +6342,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode(trans, root, parent_inode); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; fail_dir_item: @@ -9385,25 +9397,25 @@ int btrfs_init_cachep(void) btrfs_trans_handle_cachep = 
kmem_cache_create("btrfs_trans_handle", sizeof(struct btrfs_trans_handle), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", sizeof(struct btrfs_transaction), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_free_space_cachep) goto fail; @@ -9553,7 +9565,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = btrfs_update_inode(trans, root, old_inode); } if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9573,7 +9585,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = btrfs_update_inode(trans, dest, new_inode); } if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9581,7 +9593,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_dentry->d_name.name, new_dentry->d_name.len, 0, old_idx); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9589,7 +9601,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_dentry->d_name.name, old_dentry->d_name.len, 0, new_idx); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9828,7 +9840,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ret = btrfs_update_inode(trans, root, old_inode); } if 
(ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9852,7 +9864,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, d_inode(new_dentry)); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } } @@ -9861,7 +9873,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9881,7 +9893,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } } @@ -10307,7 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans, root); break; @@ -10367,7 +10379,7 @@ next: ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans, root); break; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 05173563e4a6..14ed1e9e6bc8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -561,7 +561,7 @@ static noinline int create_subvol(struct inode *dir, new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -570,7 +570,7 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); if (ret) 
{ /* We potentially lose an unused inode item here */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -583,7 +583,7 @@ static noinline int create_subvol(struct inode *dir, */ ret = btrfs_set_inode_index(dir, &index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -591,7 +591,7 @@ static noinline int create_subvol(struct inode *dir, name, namelen, dir, &key, BTRFS_FT_DIR, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -608,7 +608,7 @@ static noinline int create_subvol(struct inode *dir, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); fail: kfree(root_item); @@ -1948,8 +1948,7 @@ static noinline int key_in_sk(struct btrfs_key *key, return 1; } -static noinline int copy_to_sk(struct btrfs_root *root, - struct btrfs_path *path, +static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, size_t *buf_size, @@ -2120,7 +2119,7 @@ static noinline int search_ioctl(struct inode *inode, ret = 0; goto err; } - ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf, + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); if (ret) @@ -2406,7 +2405,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * rmdir(2). 
*/ err = -EPERM; - if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; /* @@ -2489,7 +2488,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dentry->d_name.len); if (ret) { err = ret; - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -2505,7 +2504,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, root->fs_info->tree_root, dest->root_key.objectid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2515,7 +2514,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2525,7 +2524,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, BTRFS_UUID_KEY_RECEIVED_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -3292,7 +3291,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3694,7 +3693,7 @@ process_slot: if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); + ret); btrfs_end_transaction(trans, root); goto out; } @@ -3702,8 +3701,7 @@ process_slot: ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3735,7 +3733,6 @@ process_slot: new_key.offset - 
datao); if (ret) { btrfs_abort_transaction(trans, - root, ret); btrfs_end_transaction(trans, root); @@ -3772,7 +3769,6 @@ process_slot: if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); btrfs_end_transaction(trans, root); goto out; @@ -3828,7 +3824,7 @@ process_slot: last_dest_end, destoff + len, 1); if (ret) { if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -5164,13 +5160,13 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_commit_transaction(trans, root); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index aca8264f4a49..3b78d38173b3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1122,7 +1122,7 @@ int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", sizeof(struct btrfs_ordered_extent), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_ordered_extent_cache) return -ENOMEM; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 36992128c746..cf0b444ac4f3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -350,6 +350,7 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *parent_root) { + struct super_block *sb = root->fs_info->sb; struct btrfs_key key; struct inode *parent_inode, *child_inode; int ret; @@ -358,12 +359,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - parent_inode = btrfs_iget(parent_root->fs_info->sb, &key, - parent_root, NULL); + 
parent_inode = btrfs_iget(sb, &key, parent_root, NULL); if (IS_ERR(parent_inode)) return PTR_ERR(parent_inode); - child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + child_inode = btrfs_iget(sb, &key, root, NULL); if (IS_ERR(child_inode)) { iput(parent_inode); return PTR_ERR(child_inode); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9d4c05b14f6e..93ee1c18ef9d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -571,7 +571,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; - if (btrfs_test_is_dummy_root(quota_root)) + if (btrfs_is_testing(quota_root->fs_info)) return 0; path = btrfs_alloc_path(); @@ -728,7 +728,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; key.objectid = 0; @@ -1453,9 +1453,10 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, return ret; } -struct btrfs_qgroup_extent_record -*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) +struct btrfs_qgroup_extent_record * +btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; struct rb_node *parent_node = NULL; @@ -1463,7 +1464,7 @@ struct btrfs_qgroup_extent_record u64 bytenr = record->bytenr; assert_spin_locked(&delayed_refs->lock); - trace_btrfs_qgroup_insert_dirty_extent(record); + trace_btrfs_qgroup_insert_dirty_extent(fs_info, record); while (*p) { parent_node = *p; @@ -1595,8 +1596,8 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); - trace_qgroup_update_counters(qg->qgroupid, cur_old_count, 
- cur_new_count); + trace_qgroup_update_counters(fs_info, qg->qgroupid, + cur_old_count, cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { @@ -1687,8 +1688,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, goto out_free; BUG_ON(!fs_info->quota_root); - trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots, - nr_new_roots); + trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes, + nr_old_roots, nr_new_roots); qgroups = ulist_alloc(GFP_NOFS); if (!qgroups) { @@ -1759,7 +1760,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record = rb_entry(node, struct btrfs_qgroup_extent_record, node); - trace_btrfs_qgroup_account_extents(record); + trace_btrfs_qgroup_account_extents(fs_info, record); if (!ret) { /* @@ -2195,7 +2196,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) return; - btrfs_err(trans->root->fs_info, + btrfs_err(trans->fs_info, "qgroups not uptodate in trans handle %p: list is%s empty, " "seq is %#x.%x", trans, list_empty(&trans->qgroup_ref_list) ? 
"" : " not", diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index ecb2c143ef75..710887c06aaf 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -63,9 +63,10 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -struct btrfs_qgroup_extent_record -*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); +struct btrfs_qgroup_extent_record * +btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -88,7 +89,7 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); - trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes); + trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); } void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fc067b07e31f..b26a5aea41b4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -235,12 +235,12 @@ static void backref_cache_cleanup(struct backref_cache *cache) cache->last_trans = 0; for (i = 0; i < BTRFS_MAX_LEVEL; i++) - BUG_ON(!list_empty(&cache->pending[i])); - BUG_ON(!list_empty(&cache->changed)); - BUG_ON(!list_empty(&cache->detached)); - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); - BUG_ON(cache->nr_nodes); - BUG_ON(cache->nr_edges); + ASSERT(list_empty(&cache->pending[i])); + ASSERT(list_empty(&cache->changed)); + ASSERT(list_empty(&cache->detached)); + ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); + ASSERT(!cache->nr_nodes); + ASSERT(!cache->nr_edges); } static struct backref_node 
*alloc_backref_node(struct backref_cache *cache) @@ -1171,8 +1171,12 @@ out: lower = list_entry(useless.next, struct backref_node, list); list_del_init(&lower->list); + if (lower == node) + node = NULL; free_backref_node(cache, lower); } + + free_backref_node(cache, node); return ERR_PTR(err); } ASSERT(!node || !node->detached); @@ -1719,7 +1723,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -1727,7 +1731,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, parent, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index f1c30861d062..7fd7e1830cfe 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -150,7 +150,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -176,20 +176,20 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_del_item(trans, root, path); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } l = path->nodes[0]; @@ -448,7 +448,7 @@ again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name_len); if (ret) { - 
btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e08b6bc676e3..1d195d2b32c6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3785,27 +3785,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) fs_info->scrub_workers = - btrfs_alloc_workqueue("scrub", flags, + btrfs_alloc_workqueue(fs_info, "scrub", flags, 1, 4); else fs_info->scrub_workers = - btrfs_alloc_workqueue("scrub", flags, + btrfs_alloc_workqueue(fs_info, "scrub", flags, max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue("scrubwrc", flags, + btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; fs_info->scrub_nocow_workers = - btrfs_alloc_workqueue("scrubnc", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0); if (!fs_info->scrub_nocow_workers) goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = - btrfs_alloc_workqueue("scrubparity", flags, + btrfs_alloc_workqueue(fs_info, "scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; @@ -3860,7 +3860,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ - btrfs_err(fs_info, + btrfs_err_rl(fs_info, "scrub: size assumption sectorsize != PAGE_SIZE " "(%d != %lu) fails", fs_info->chunk_root->sectorsize, PAGE_SIZE); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 60e7179ed4b7..864ce334f696 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -184,6 +184,22 @@ static const char * const logtypes[] = { "debug", }; + +/* + * Use one ratelimit state per log level so 
that a flood of less important + * messages doesn't cause more important ones to be dropped. + */ +static struct ratelimit_state printk_limits[] = { + RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), +}; + void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; @@ -192,6 +208,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) va_list args; const char *type = logtypes[4]; int kern_level; + struct ratelimit_state *ratelimit; va_start(args, fmt); @@ -202,13 +219,18 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) lvl[size] = '\0'; fmt += size; type = logtypes[kern_level - '0']; - } else + ratelimit = &printk_limits[kern_level - '0']; + } else { *lvl = '\0'; + /* Default to debug output */ + ratelimit = &printk_limits[7]; + } vaf.fmt = fmt; vaf.va = &args; - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); + if (__ratelimit(ratelimit)) + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); va_end(args); } @@ -229,9 +251,11 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) */ __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, + const char *function, unsigned int line, int errno) { + struct btrfs_fs_info *fs_info = trans->fs_info; + trans->aborted = errno; /* Nothing used. 
The other threads that have joined this * transaction may be able to continue. */ @@ -239,16 +263,16 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *errstr; errstr = btrfs_decode_error(errno); - btrfs_warn(root->fs_info, + btrfs_warn(fs_info, "%s:%d: Aborting unused transaction(%s).", function, line, errstr); return; } ACCESS_ONCE(trans->transaction->aborted) = errno; /* Wake up anybody who may be waiting on this transaction */ - wake_up(&root->fs_info->transaction_wait); - wake_up(&root->fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL); + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); } /* * __btrfs_panic decodes unexpected, fatal errors from the caller, @@ -432,12 +456,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, */ break; case Opt_nodatasum: - btrfs_set_and_info(root, NODATASUM, + btrfs_set_and_info(info, NODATASUM, "setting nodatasum"); break; case Opt_datasum: - if (btrfs_test_opt(root, NODATASUM)) { - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(info, NODATASUM)) { + if (btrfs_test_opt(info, NODATACOW)) btrfs_info(root->fs_info, "setting datasum, datacow enabled"); else btrfs_info(root->fs_info, "setting datasum"); @@ -446,9 +470,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - if (!btrfs_test_opt(root, NODATACOW)) { - if (!btrfs_test_opt(root, COMPRESS) || - !btrfs_test_opt(root, FORCE_COMPRESS)) { + if (!btrfs_test_opt(info, NODATACOW)) { + if (!btrfs_test_opt(info, COMPRESS) || + !btrfs_test_opt(info, FORCE_COMPRESS)) { btrfs_info(root->fs_info, "setting nodatacow, compression disabled"); } else { @@ -461,7 +485,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_set_opt(info->mount_opt, NODATASUM); break; case 
Opt_datacow: - btrfs_clear_and_info(root, NODATACOW, + btrfs_clear_and_info(info, NODATACOW, "setting datacow"); break; case Opt_compress_force: @@ -470,10 +494,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, /* Fallthrough */ case Opt_compress: case Opt_compress_type: - saved_compress_type = btrfs_test_opt(root, COMPRESS) ? + saved_compress_type = btrfs_test_opt(info, + COMPRESS) ? info->compress_type : BTRFS_COMPRESS_NONE; saved_compress_force = - btrfs_test_opt(root, FORCE_COMPRESS); + btrfs_test_opt(info, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -513,10 +538,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } - if ((btrfs_test_opt(root, COMPRESS) && + if ((btrfs_test_opt(info, COMPRESS) && (info->compress_type != saved_compress_type || compress_force != saved_compress_force)) || - (!btrfs_test_opt(root, COMPRESS) && + (!btrfs_test_opt(info, COMPRESS) && no_compress == 1)) { btrfs_info(root->fs_info, "%s %s compression", @@ -526,25 +551,25 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, compress_force = false; break; case Opt_ssd: - btrfs_set_and_info(root, SSD, + btrfs_set_and_info(info, SSD, "use ssd allocation scheme"); break; case Opt_ssd_spread: - btrfs_set_and_info(root, SSD_SPREAD, + btrfs_set_and_info(info, SSD_SPREAD, "use spread ssd allocation scheme"); btrfs_set_opt(info->mount_opt, SSD); break; case Opt_nossd: - btrfs_set_and_info(root, NOSSD, + btrfs_set_and_info(info, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); break; case Opt_barrier: - btrfs_clear_and_info(root, NOBARRIER, + btrfs_clear_and_info(info, NOBARRIER, "turning on barriers"); break; case Opt_nobarrier: - btrfs_set_and_info(root, NOBARRIER, + btrfs_set_and_info(info, NOBARRIER, "turning off barriers"); break; case Opt_thread_pool: @@ -604,24 +629,24 @@ int 
btrfs_parse_options(struct btrfs_root *root, char *options, root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; case Opt_notreelog: - btrfs_set_and_info(root, NOTREELOG, + btrfs_set_and_info(info, NOTREELOG, "disabling tree log"); break; case Opt_treelog: - btrfs_clear_and_info(root, NOTREELOG, + btrfs_clear_and_info(info, NOTREELOG, "enabling tree log"); break; case Opt_norecovery: case Opt_nologreplay: - btrfs_set_and_info(root, NOLOGREPLAY, + btrfs_set_and_info(info, NOLOGREPLAY, "disabling log replay at mount time"); break; case Opt_flushoncommit: - btrfs_set_and_info(root, FLUSHONCOMMIT, + btrfs_set_and_info(info, FLUSHONCOMMIT, "turning on flush-on-commit"); break; case Opt_noflushoncommit: - btrfs_clear_and_info(root, FLUSHONCOMMIT, + btrfs_clear_and_info(info, FLUSHONCOMMIT, "turning off flush-on-commit"); break; case Opt_ratio: @@ -638,11 +663,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, } break; case Opt_discard: - btrfs_set_and_info(root, DISCARD, + btrfs_set_and_info(info, DISCARD, "turning on discard"); break; case Opt_nodiscard: - btrfs_clear_and_info(root, DISCARD, + btrfs_clear_and_info(info, DISCARD, "turning off discard"); break; case Opt_space_cache: @@ -651,12 +676,13 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, strcmp(args[0].from, "v1") == 0) { btrfs_clear_opt(root->fs_info->mount_opt, FREE_SPACE_TREE); - btrfs_set_and_info(root, SPACE_CACHE, + btrfs_set_and_info(info, SPACE_CACHE, "enabling disk space caching"); } else if (strcmp(args[0].from, "v2") == 0) { btrfs_clear_opt(root->fs_info->mount_opt, SPACE_CACHE); - btrfs_set_and_info(root, FREE_SPACE_TREE, + btrfs_set_and_info(info, + FREE_SPACE_TREE, "enabling free space tree"); } else { ret = -EINVAL; @@ -667,12 +693,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - if (btrfs_test_opt(root, SPACE_CACHE)) { - btrfs_clear_and_info(root, 
SPACE_CACHE, + if (btrfs_test_opt(info, SPACE_CACHE)) { + btrfs_clear_and_info(info, + SPACE_CACHE, "disabling disk space caching"); } - if (btrfs_test_opt(root, FREE_SPACE_TREE)) { - btrfs_clear_and_info(root, FREE_SPACE_TREE, + if (btrfs_test_opt(info, FREE_SPACE_TREE)) { + btrfs_clear_and_info(info, + FREE_SPACE_TREE, "disabling free space tree"); } break; @@ -685,7 +713,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, "disabling inode map caching"); break; case Opt_clear_cache: - btrfs_set_and_info(root, CLEAR_CACHE, + btrfs_set_and_info(info, CLEAR_CACHE, "force clearing of disk cache"); break; case Opt_user_subvol_rm_allowed: @@ -698,11 +726,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); break; case Opt_defrag: - btrfs_set_and_info(root, AUTO_DEFRAG, + btrfs_set_and_info(info, AUTO_DEFRAG, "enabling auto defrag"); break; case Opt_nodefrag: - btrfs_clear_and_info(root, AUTO_DEFRAG, + btrfs_clear_and_info(info, AUTO_DEFRAG, "disabling auto defrag"); break; case Opt_recovery: @@ -810,22 +838,22 @@ check: /* * Extra check for current option against current flag */ - if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { + if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { btrfs_err(root->fs_info, "nologreplay must be used with ro mount option"); ret = -EINVAL; } out: if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && - !btrfs_test_opt(root, FREE_SPACE_TREE) && - !btrfs_test_opt(root, CLEAR_CACHE)) { + !btrfs_test_opt(info, FREE_SPACE_TREE) && + !btrfs_test_opt(info, CLEAR_CACHE)) { btrfs_err(root->fs_info, "cannot disable free space tree"); ret = -EINVAL; } - if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + if (!ret && btrfs_test_opt(info, SPACE_CACHE)) btrfs_info(root->fs_info, "disk space caching is enabled"); - if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE)) + if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) 
btrfs_info(root->fs_info, "using free space tree"); kfree(orig); return ret; @@ -1149,7 +1177,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - trace_btrfs_sync_fs(wait); + trace_btrfs_sync_fs(fs_info, wait); if (!wait) { filemap_flush(fs_info->btree_inode->i_mapping); @@ -1192,13 +1220,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) struct btrfs_root *root = info->tree_root; char *compress_type; - if (btrfs_test_opt(root, DEGRADED)) + if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); - if (btrfs_test_opt(root, NODATASUM)) + if (btrfs_test_opt(info, NODATASUM)) seq_puts(seq, ",nodatasum"); - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(info, NODATACOW)) seq_puts(seq, ",nodatacow"); - if (btrfs_test_opt(root, NOBARRIER)) + if (btrfs_test_opt(info, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); @@ -1207,56 +1235,56 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); - if (btrfs_test_opt(root, COMPRESS)) { + if (btrfs_test_opt(info, COMPRESS)) { if (info->compress_type == BTRFS_COMPRESS_ZLIB) compress_type = "zlib"; else compress_type = "lzo"; - if (btrfs_test_opt(root, FORCE_COMPRESS)) + if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); } - if (btrfs_test_opt(root, NOSSD)) + if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); - if (btrfs_test_opt(root, SSD_SPREAD)) + if (btrfs_test_opt(info, SSD_SPREAD)) seq_puts(seq, ",ssd_spread"); - else if (btrfs_test_opt(root, SSD)) + else if (btrfs_test_opt(info, SSD)) seq_puts(seq, ",ssd"); - if 
(btrfs_test_opt(root, NOTREELOG)) + if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); - if (btrfs_test_opt(root, NOLOGREPLAY)) + if (btrfs_test_opt(info, NOLOGREPLAY)) seq_puts(seq, ",nologreplay"); - if (btrfs_test_opt(root, FLUSHONCOMMIT)) + if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(info, DISCARD)) seq_puts(seq, ",discard"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(info, SPACE_CACHE)) seq_puts(seq, ",space_cache"); - else if (btrfs_test_opt(root, FREE_SPACE_TREE)) + else if (btrfs_test_opt(info, FREE_SPACE_TREE)) seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); - if (btrfs_test_opt(root, RESCAN_UUID_TREE)) + if (btrfs_test_opt(info, RESCAN_UUID_TREE)) seq_puts(seq, ",rescan_uuid_tree"); - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(info, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); - if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED)) seq_puts(seq, ",user_subvol_rm_allowed"); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) + if (btrfs_test_opt(info, ENOSPC_DEBUG)) seq_puts(seq, ",enospc_debug"); - if (btrfs_test_opt(root, AUTO_DEFRAG)) + if (btrfs_test_opt(info, AUTO_DEFRAG)) seq_puts(seq, ",autodefrag"); - if (btrfs_test_opt(root, INODE_MAP_CACHE)) + if (btrfs_test_opt(info, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); - if (btrfs_test_opt(root, SKIP_BALANCE)) + if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) seq_puts(seq, ",check_int_data"); - else if (btrfs_test_opt(root, CHECK_INTEGRITY)) + else if (btrfs_test_opt(info, CHECK_INTEGRITY)) seq_puts(seq, ",check_int"); if 
(info->check_integrity_print_mask) seq_printf(seq, ",check_int_print_mask=%d", @@ -1265,14 +1293,14 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (info->metadata_ratio) seq_printf(seq, ",metadata_ratio=%d", info->metadata_ratio); - if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) + if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); #ifdef CONFIG_BTRFS_DEBUG - if (btrfs_test_opt(root, FRAGMENT_DATA)) + if (btrfs_test_opt(info, FRAGMENT_DATA)) seq_puts(seq, ",fragment=data"); - if (btrfs_test_opt(root, FRAGMENT_METADATA)) + if (btrfs_test_opt(info, FRAGMENT_METADATA)) seq_puts(seq, ",fragment=metadata"); #endif seq_printf(seq, ",subvolid=%llu", @@ -2030,9 +2058,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * chunk). * * If metadata is exhausted, f_bavail will be 0. - * - * FIXME: not accurate for mixed block groups, total and free/used are ok, - * available appears slightly larger. 
*/ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -2319,49 +2344,6 @@ static void btrfs_print_mod_info(void) btrfs_crc32c_impl()); } -static int btrfs_run_sanity_tests(void) -{ - int ret, i; - u32 sectorsize, nodesize; - u32 test_sectorsize[] = { - PAGE_SIZE, - }; - ret = btrfs_init_test_fs(); - if (ret) - return ret; - for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { - sectorsize = test_sectorsize[i]; - for (nodesize = sectorsize; - nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; - nodesize <<= 1) { - pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", - sectorsize, nodesize); - ret = btrfs_test_free_space_cache(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_extent_buffer_operations(sectorsize, - nodesize); - if (ret) - goto out; - ret = btrfs_test_extent_io(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_inodes(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_qgroups(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_free_space_tree(sectorsize, nodesize); - if (ret) - goto out; - } - } -out: - btrfs_destroy_test_fs(); - return ret; -} - static int __init init_btrfs_fs(void) { int err; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 4879656bda3c..c6569905d3d1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -326,6 +326,7 @@ SPACE_INFO_ATTR(bytes_used); SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); @@ -337,6 +338,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(bytes_pinned), BTRFS_ATTR_PTR(bytes_reserved), BTRFS_ATTR_PTR(bytes_may_use), + BTRFS_ATTR_PTR(bytes_readonly), BTRFS_ATTR_PTR(disk_used), BTRFS_ATTR_PTR(disk_total), BTRFS_ATTR_PTR(total_bytes_pinned), diff --git a/fs/btrfs/tests/btrfs-tests.c 
b/fs/btrfs/tests/btrfs-tests.c index 02223f3f78f4..bf62ad919a95 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -54,7 +54,7 @@ struct inode *btrfs_new_test_inode(void) return new_inode(test_mnt->mnt_sb); } -int btrfs_init_test_fs(void) +static int btrfs_init_test_fs(void) { int ret; @@ -73,7 +73,7 @@ int btrfs_init_test_fs(void) return 0; } -void btrfs_destroy_test_fs(void) +static void btrfs_destroy_test_fs(void) { kern_unmount(test_mnt); unregister_filesystem(&test_type); @@ -128,14 +128,27 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) extent_io_tree_init(&fs_info->freed_extents[0], NULL); extent_io_tree_init(&fs_info->freed_extents[1], NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; + set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + + test_mnt->mnt_sb->s_fs_info = fs_info; + return fs_info; } -static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { struct radix_tree_iter iter; void **slot; + if (!fs_info) + return; + + if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, + &fs_info->fs_state))) + return; + + test_mnt->mnt_sb->s_fs_info = NULL; + spin_lock(&fs_info->buffer_lock); radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { struct extent_buffer *eb; @@ -167,10 +180,11 @@ void btrfs_free_dummy_root(struct btrfs_root *root) { if (!root) return; + /* Will be freed by btrfs_free_fs_roots */ + if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) + return; if (root->node) free_extent_buffer(root->node); - if (root->fs_info) - btrfs_free_dummy_fs_info(root->fs_info); kfree(root); } @@ -220,3 +234,46 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) INIT_LIST_HEAD(&trans->qgroup_ref_list); trans->type = __TRANS_DUMMY; } + +int btrfs_run_sanity_tests(void) +{ + int ret, i; + u32 sectorsize, nodesize; + u32 test_sectorsize[] = { + PAGE_SIZE, + }; + ret = btrfs_init_test_fs(); + if (ret) + 
return ret; + for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { + sectorsize = test_sectorsize[i]; + for (nodesize = sectorsize; + nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; + nodesize <<= 1) { + pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", + sectorsize, nodesize); + ret = btrfs_test_free_space_cache(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(sectorsize, + nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_io(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_inodes(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_qgroups(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(sectorsize, nodesize); + if (ret) + goto out; + } + } +out: + btrfs_destroy_test_fs(); + return ret; +} diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 66fb6b701eb7..b17ffbe8f9f3 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -20,57 +20,29 @@ #define __BTRFS_TESTS #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_run_sanity_tests(void); #define test_msg(fmt, ...) 
pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) struct btrfs_root; struct btrfs_trans_handle; -int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); -int btrfs_init_test_fs(void); -void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group_cache * btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize); void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else -static inline int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_extent_buffer_operations(u32 sectorsize, - u32 nodesize) -{ - return 0; -} -static inline int btrfs_init_test_fs(void) -{ - return 0; -} -static inline void btrfs_destroy_test_fs(void) -{ -} -static inline int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_inodes(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) +static inline int btrfs_run_sanity_tests(void) { return 0; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 4f8cbd1ec5ee..199569174637 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -24,8 +24,9 @@ static int 
test_btrfs_split_item(u32 sectorsize, u32 nodesize) { - struct btrfs_path *path; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path = NULL; + struct btrfs_root *root = NULL; struct extent_buffer *eb; struct btrfs_item *item; char *value = "mary had a little lamb"; @@ -40,17 +41,24 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) test_msg("Running btrfs_split_item tests\n"); - root = btrfs_alloc_dummy_root(sectorsize, nodesize); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Could not allocate fs_info\n"); + return -ENOMEM; + } + + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Could not allocate root\n"); - return PTR_ERR(root); + ret = PTR_ERR(root); + goto out; } path = btrfs_alloc_path(); if (!path) { test_msg("Could not allocate path\n"); - kfree(root); - return -ENOMEM; + ret = -ENOMEM; + goto out; } path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize, @@ -219,7 +227,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) } out: btrfs_free_path(path); - kfree(root); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 3956bb2ff84c..3221c8dee272 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -837,6 +837,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info; struct btrfs_block_group_cache *cache; struct btrfs_root *root = NULL; int ret = -ENOMEM; @@ -855,15 +856,17 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) return 0; } - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - ret = PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + ret = -ENOMEM; goto out; } - root->fs_info 
= btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + ret = PTR_ERR(root); goto out; + } root->fs_info->extent_root = root; cache->fs_info = root->fs_info; @@ -882,6 +885,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); test_msg("Free space cache tests finished\n"); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index aac507085ab0..7508d3b42780 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -443,23 +443,24 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *, static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info; struct btrfs_root *root = NULL; struct btrfs_block_group_cache *cache = NULL; struct btrfs_trans_handle trans; struct btrfs_path *path = NULL; int ret; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate dummy root\n"); - ret = PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); - ret = -ENOMEM; + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate dummy root\n"); + ret = PTR_ERR(root); goto out; } @@ -534,6 +535,7 @@ out: btrfs_free_path(path); btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 29648c0a39f1..9f72aeda9220 100644 --- a/fs/btrfs/tests/inode-tests.c +++ 
b/fs/btrfs/tests/inode-tests.c @@ -230,6 +230,7 @@ static unsigned long vacancy_only = 0; static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; struct extent_map *em = NULL; @@ -248,19 +249,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - /* - * We do this since btrfs_get_extent wants to assign em->bdev to - * root->fs_info->fs_devices->latest_bdev. - */ - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } @@ -835,11 +832,13 @@ out: free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } static int test_hole_first(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; struct extent_map *em = NULL; @@ -855,15 +854,15 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = 
btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } @@ -934,11 +933,13 @@ out: free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } static int test_extent_accounting(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; int ret = -ENOMEM; @@ -949,15 +950,15 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) return ret; } - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } @@ -1132,6 +1133,7 @@ out: NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 57a12c0d680b..4407fef7c16c 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -453,22 +453,24 @@ static int test_multiple_refs(struct btrfs_root *root, int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct btrfs_root *root; struct btrfs_root *tmp_root; int ret = 0; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); - return PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + return -ENOMEM; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs 
info\n"); - ret = -ENOMEM; + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + ret = PTR_ERR(root); goto out; } + /* We are using this root as our extent root */ root->fs_info->extent_root = root; @@ -495,7 +497,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) btrfs_set_header_nritems(root->node, 0); root->alloc_bytenr += 2 * nodesize; - tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); + tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); @@ -510,7 +512,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) goto out; } - tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); + tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); @@ -531,5 +533,6 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) ret = test_multiple_refs(root, sectorsize, nodesize); out: btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 948aa186b353..9cca0a721961 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -561,6 +561,7 @@ again: h->transaction = cur_trans; h->root = root; h->use_count = 1; + h->fs_info = root->fs_info; h->type = type; h->can_flush_pending_bgs = true; @@ -1491,7 +1492,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto dir_item_existed; } else if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } btrfs_release_path(path); @@ -1504,7 +1505,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_run_delayed_items(trans, root); if (ret) { /* Transaction aborted */ - 
btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1543,7 +1544,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1554,7 +1555,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } /* see comments in should_cow_block() */ @@ -1568,7 +1569,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(tmp); free_extent_buffer(tmp); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1580,7 +1581,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1588,19 +1589,19 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1622,7 +1623,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* We have check then name at the beginning, so it is 
impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1632,13 +1633,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { @@ -1647,14 +1648,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1709,7 +1710,7 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) super->cache_generation = root_item->generation; if (root->fs_info->update_uuid_tree_gen) super->uuid_tree_generation = root_item->generation; @@ -1850,7 +1851,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, WARN_ON(trans->use_count > 1); - btrfs_abort_transaction(trans, root, err); + btrfs_abort_transaction(trans, err); spin_lock(&root->fs_info->trans_lock); @@ -1895,14 +1896,14 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, static inline int 
btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { - if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) return btrfs_start_delalloc_roots(fs_info, 1, -1); return 0; } static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { - if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index c5abee4f01ad..efb122643380 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -128,6 +128,7 @@ struct btrfs_trans_handle { * Subvolume quota depends on this */ struct btrfs_root *root; + struct btrfs_fs_info *fs_info; struct seq_list delayed_ref_elem; struct list_head qgroup_ref_list; struct list_head new_bgs; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c05f69a8ec42..d31a0c4f56be 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2757,7 +2757,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root, SSD) && + if (!btrfs_test_opt(root->fs_info, SSD) && test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); @@ -2788,7 +2788,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { blk_finish_plug(&plug); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); btrfs_set_log_full_commit(root->fs_info, trans); mutex_unlock(&root->log_mutex); @@ -2838,7 +2838,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); if (ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); + 
btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } @@ -2898,7 +2898,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; @@ -2934,7 +2934,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_wake_log_root; } @@ -2991,7 +2991,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, ret = walk_log_tree(trans, log, &wc); /* I don't think this can happen but just in case */ if (ret) - btrfs_abort_transaction(trans, log, ret); + btrfs_abort_transaction(trans, ret); while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -3160,7 +3160,7 @@ out_unlock: btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); @@ -3193,7 +3193,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); return ret; @@ -4703,6 +4703,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ins_nr = 0; ret = btrfs_search_forward(root, &min_key, path, trans->transid); + if (ret < 0) { + err = ret; + goto out_unlock; + } if (ret != 0) break; again: @@ -5301,7 +5305,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, sb = inode->i_sb; - if (btrfs_test_opt(root, NOTREELOG)) { + 
if (btrfs_test_opt(root->fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0fb4a959012e..bb0addce7558 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -140,7 +140,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static void btrfs_close_one_device(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -853,6 +852,46 @@ static void free_device(struct rcu_head *head) schedule_work(&device->rcu_work); } +static void btrfs_close_one_device(struct btrfs_device *device) +{ + struct btrfs_fs_devices *fs_devices = device->fs_devices; + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->missing) + fs_devices->missing_devices--; + + if (device->bdev && device->writeable) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); +} + static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; @@ -2399,14 +2438,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = 
init_first_rw_device(trans, root, device); unlock_chunks(root); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } } ret = btrfs_add_device(trans, root, device); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } @@ -2415,7 +2454,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_finish_sprout(trans, root); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } @@ -2801,7 +2840,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, &dev_extent_len); if (ret) { mutex_unlock(&fs_devices->device_list_mutex); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2820,7 +2859,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) { mutex_unlock(&fs_devices->device_list_mutex); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -2829,7 +2868,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2838,14 +2877,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2902,7 +2941,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) * chunk tree entries */ ret = 
btrfs_remove_chunk(trans, root, chunk_offset); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans, extent_root); return ret; } @@ -3421,7 +3460,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u64 size_to_free; u64 chunk_type; struct btrfs_chunk *chunk; - struct btrfs_path *path; + struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_trans_handle *trans; @@ -3455,13 +3494,33 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) ret = btrfs_shrink_device(device, old_size - size_to_free); if (ret == -ENOSPC) break; - BUG_ON(ret); + if (ret) { + /* btrfs_shrink_device never returns ret > 0 */ + WARN_ON(ret > 0); + goto error; + } trans = btrfs_start_transaction(dev_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_info_in_rcu(fs_info, + "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", + rcu_str_deref(device->name), ret, + old_size, old_size - size_to_free); + goto error; + } ret = btrfs_grow_device(trans, device, old_size); - BUG_ON(ret); + if (ret) { + btrfs_end_transaction(trans, dev_root); + /* btrfs_grow_device never returns ret > 0 */ + WARN_ON(ret > 0); + btrfs_info_in_rcu(fs_info, + "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", + rcu_str_deref(device->name), ret, + old_size, old_size - size_to_free); + goto error; + } btrfs_end_transaction(trans, dev_root); } @@ -3885,7 +3944,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->balance_lock); - if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { btrfs_info(fs_info, "force skipping balance"); return 0; } @@ -4240,7 +4299,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); - btrfs_abort_transaction(trans, 
tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -4514,8 +4573,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } -#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ - - sizeof(struct btrfs_item) \ +#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -6401,7 +6459,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, BTRFS_UUID_SIZE); map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, uuid, NULL); - if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + if (!map->stripes[i].dev && + !btrfs_test_opt(root->fs_info, DEGRADED)) { free_extent_map(em); return -EIO; } @@ -6469,7 +6528,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root, fs_devices = find_fsid(fsid); if (!fs_devices) { - if (!btrfs_test_opt(root, DEGRADED)) + if (!btrfs_test_opt(root->fs_info, DEGRADED)) return ERR_PTR(-ENOENT); fs_devices = alloc_fs_devices(fsid); @@ -6531,7 +6590,7 @@ static int read_one_dev(struct btrfs_root *root, device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); if (!device) { - if (!btrfs_test_opt(root, DEGRADED)) + if (!btrfs_test_opt(root->fs_info, DEGRADED)) return -EIO; device = add_missing_dev(root, fs_devices, devid, dev_uuid); @@ -6540,7 +6599,7 @@ static int read_one_dev(struct btrfs_root *root, btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", devid, dev_uuid); } else { - if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) + if (!device->bdev && !btrfs_test_opt(root->fs_info, DEGRADED)) return -EIO; if(!device->bdev && !device->missing) { @@ -7143,38 +7202,3 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) fs_devices = fs_devices->seed; } } - -static void btrfs_close_one_device(struct btrfs_device *device) -{ - struct btrfs_fs_devices *fs_devices = 
device->fs_devices; - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; - - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - list_del_init(&device->dev_alloc_list); - fs_devices->rw_devices--; - } - - if (device->missing) - fs_devices->missing_devices--; - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); - } - - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; - - call_rcu(&device->rcu, free_device); -} diff --git a/fs/exec.c b/fs/exec.c index a1789cd684bf..6fcfb3f7b137 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -762,6 +762,39 @@ out_unlock: } EXPORT_SYMBOL(setup_arg_pages); +#else + +/* + * Transfer the program arguments and environment from the holding pages + * onto the stack. The provided stack pointer is adjusted accordingly. + */ +int transfer_args_to_stack(struct linux_binprm *bprm, + unsigned long *sp_location) +{ + unsigned long index, stop, sp; + int ret = 0; + + stop = bprm->p >> PAGE_SHIFT; + sp = *sp_location; + + for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { + unsigned int offset = index == stop ? 
bprm->p & ~PAGE_MASK : 0; + char *src = kmap(bprm->page[index]) + offset; + sp -= PAGE_SIZE - offset; + if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) + ret = -EFAULT; + kunmap(bprm->page[index]); + if (ret) + goto out; + } + + *sp_location = sp; + +out: + return ret; +} +EXPORT_SYMBOL(transfer_args_to_stack); + #endif /* CONFIG_MMU */ static struct file *do_open_execat(int fd, struct filename *name, int flags) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 5c57654927a6..90e46cd752fe 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -959,10 +959,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) if (S_ISLNK(root_inode->i_mode)) { char *name = follow_link(host_root_path); - if (IS_ERR(name)) + if (IS_ERR(name)) { err = PTR_ERR(name); - else - err = read_name(root_inode, name); + goto out_put; + } + err = read_name(root_inode, name); kfree(name); if (err) goto out_put; diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index c9f583d7bac8..47febcf99185 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -90,6 +90,7 @@ config NFSD_BLOCKLAYOUT bool "NFSv4.1 server support for pNFS block layouts" depends on NFSD_V4 && BLOCK select NFSD_PNFS + select EXPORTFS_BLOCK_OPS help This option enables support for the exporting pNFS block layouts in the kernel's NFS server. The pNFS block layout enables NFS @@ -102,6 +103,7 @@ config NFSD_SCSILAYOUT bool "NFSv4.1 server support for pNFS SCSI layouts" depends on NFSD_V4 && BLOCK select NFSD_PNFS + select EXPORTFS_BLOCK_OPS help This option enables support for the exporting pNFS SCSI layouts in the kernel's NFS server. The pNFS SCSI layout enables NFS @@ -111,6 +113,23 @@ config NFSD_SCSILAYOUT If unsure, say N. +config NFSD_FLEXFILELAYOUT + bool "NFSv4.1 server support for pNFS Flex File layouts" + depends on NFSD_V4 + select NFSD_PNFS + help + This option enables support for the exporting pNFS Flex File + layouts in the kernel's NFS server. 
The pNFS Flex File layout + enables NFS clients to directly perform I/O to NFSv3 devices + accessible to both the server and the clients. See + draft-ietf-nfsv4-flex-files for more details. + + Warning, this server implements the bare minimum functionality + to be a flex file server - it is for testing the client, + not for use in production. + + If unsure, say N. + config NFSD_V4_SECURITY_LABEL bool "Provide Security Label support for NFSv4 server" depends on NFSD_V4 && SECURITY diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 3ae5f3c77e28..5f5d3a76980c 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -20,3 +20,4 @@ nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o +nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index ad2c05e80a83..5a1708441510 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -163,6 +163,7 @@ nfsd4_block_get_device_info_simple(struct super_block *sb, static __be32 nfsd4_block_proc_getdeviceinfo(struct super_block *sb, + struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { @@ -355,6 +356,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, static __be32 nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, + struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 4ebaaf4b8d8a..ac6f54546fdd 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -44,7 +44,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) switch (b->type) { case PNFS_BLOCK_VOLUME_SIMPLE: - len = 4 + 4 + 8 + 4 + b->simple.sig_len; + len = 4 + 4 + 8 + 4 + (XDR_QUADLEN(b->simple.sig_len) << 2); 
p = xdr_reserve_space(xdr, len); if (!p) return -ETOOSMALL; @@ -55,7 +55,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len); break; case PNFS_BLOCK_VOLUME_SCSI: - len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8; + len = 4 + 4 + 4 + 4 + (XDR_QUADLEN(b->scsi.designator_len) << 2) + 8; p = xdr_reserve_space(xdr, len); if (!p) return -ETOOSMALL; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b4d84b579f20..43e109cc0ccc 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -706,7 +706,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; - new->ex_layout_type = 0; + new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; } @@ -731,7 +731,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; - new->ex_layout_type = item->ex_layout_type; + new->ex_layout_types = item->ex_layout_types; new->ex_nflavors = item->ex_nflavors; for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; @@ -954,6 +954,16 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX) return 0; } + + /* If the compound op contains a spo_must_allowed op, + * it will be sent with integrity/protection which + * will have to be expressly allowed on mounts that + * don't support it + */ + + if (nfsd4_spo_must_allow(rqstp)) + return 0; + return nfserr_wrongsec; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 2e315072bf3f..730f15eeb7ed 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -57,7 +57,7 @@ struct svc_export { struct nfsd4_fs_locations ex_fslocs; uint32_t ex_nflavors; struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; - 
enum pnfs_layouttype ex_layout_type; + u32 ex_layout_types; struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; }; diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c new file mode 100644 index 000000000000..df880e9fa71f --- /dev/null +++ b/fs/nfsd/flexfilelayout.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> + * + * The following implements a super-simple flex-file server + * where the NFSv4.1 mds is also the ds. And the storage is + * the same. I.e., writing to the mds via a NFSv4.1 WRITE + * goes to the same location as the NFSv3 WRITE. + */ +#include <linux/slab.h> + +#include <linux/nfsd/debug.h> + +#include <linux/sunrpc/addr.h> + +#include "flexfilelayoutxdr.h" +#include "pnfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +static __be32 +nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, + struct nfsd4_layoutget *args) +{ + struct nfsd4_layout_seg *seg = &args->lg_seg; + u32 device_generation = 0; + int error; + uid_t u; + + struct pnfs_ff_layout *fl; + + /* + * The super simple flex file server has 1 mirror, 1 data server, + * and 1 file handle. So instead of 4 allocs, do 1 for now. + * Zero it out for the stateid - don't want junk in there! + */ + error = -ENOMEM; + fl = kzalloc(sizeof(*fl), GFP_KERNEL); + if (!fl) + goto out_error; + args->lg_content = fl; + + /* + * Avoid layout commit, try to force the I/O to the DS, + * and for fun, cause all IOMODE_RW layout segments to + * effectively be WRITE only. 
+ */ + fl->flags = FF_FLAGS_NO_LAYOUTCOMMIT | FF_FLAGS_NO_IO_THRU_MDS | + FF_FLAGS_NO_READ_IO; + + /* Do not allow an IOMODE_READ segment to have write permissions */ + if (seg->iomode == IOMODE_READ) { + u = from_kuid(&init_user_ns, inode->i_uid) + 1; + fl->uid = make_kuid(&init_user_ns, u); + } else + fl->uid = inode->i_uid; + fl->gid = inode->i_gid; + + error = nfsd4_set_deviceid(&fl->deviceid, fhp, device_generation); + if (error) + goto out_error; + + fl->fh.size = fhp->fh_handle.fh_size; + memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size); + + /* Give whole file layout segments */ + seg->offset = 0; + seg->length = NFS4_MAX_UINT64; + + dprintk("GET: 0x%llx:0x%llx %d\n", seg->offset, seg->length, + seg->iomode); + return 0; + +out_error: + seg->length = 0; + return nfserrno(error); +} + +static __be32 +nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, + struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_ff_device_addr *da; + + u16 port; + char addr[INET6_ADDRSTRLEN]; + + da = kzalloc(sizeof(struct pnfs_ff_device_addr), GFP_KERNEL); + if (!da) + return nfserrno(-ENOMEM); + + gdp->gd_device = da; + + da->version = 3; + da->minor_version = 0; + + da->rsize = svc_max_payload(rqstp); + da->wsize = da->rsize; + + rpc_ntop((struct sockaddr *)&rqstp->rq_daddr, + addr, INET6_ADDRSTRLEN); + if (rqstp->rq_daddr.ss_family == AF_INET) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&rqstp->rq_daddr; + port = ntohs(sin->sin_port); + snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp"); + da->netaddr.netid_len = 3; + } else { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&rqstp->rq_daddr; + port = ntohs(sin6->sin6_port); + snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp6"); + da->netaddr.netid_len = 4; + } + + da->netaddr.addr_len = + snprintf(da->netaddr.addr, FF_ADDR_LEN + 1, + "%s.%hhu.%hhu", addr, port >> 8, port & 0xff); + + da->tightly_coupled = false; + + return 0; +} + +const 
struct nfsd4_layout_ops ff_layout_ops = { + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, + .proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo, + .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo, + .proc_layoutget = nfsd4_ff_proc_layoutget, + .encode_layoutget = nfsd4_ff_encode_layoutget, +}; diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c new file mode 100644 index 000000000000..5e3fd7fc1a9f --- /dev/null +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> + */ +#include <linux/sunrpc/svc.h> +#include <linux/nfs4.h> + +#include "nfsd.h" +#include "flexfilelayoutxdr.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +struct ff_idmap { + char buf[11]; + int len; +}; + +__be32 +nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp) +{ + struct pnfs_ff_layout *fl = lgp->lg_content; + int len, mirror_len, ds_len, fh_len; + __be32 *p; + + /* + * Unlike nfsd4_encode_user, we know these will + * always be stringified. 
+ */ + struct ff_idmap uid; + struct ff_idmap gid; + + fh_len = 4 + fl->fh.size; + + uid.len = sprintf(uid.buf, "%u", from_kuid(&init_user_ns, fl->uid)); + gid.len = sprintf(gid.buf, "%u", from_kgid(&init_user_ns, fl->gid)); + + /* 8 + len for recording the length, name, and padding */ + ds_len = 20 + sizeof(stateid_opaque_t) + 4 + fh_len + + 8 + uid.len + 8 + gid.len; + + mirror_len = 4 + ds_len; + + /* The layout segment */ + len = 20 + mirror_len; + + p = xdr_reserve_space(xdr, sizeof(__be32) + len); + if (!p) + return nfserr_toosmall; + + *p++ = cpu_to_be32(len); + p = xdr_encode_hyper(p, 0); /* stripe unit of 1 */ + + *p++ = cpu_to_be32(1); /* single mirror */ + *p++ = cpu_to_be32(1); /* single data server */ + + p = xdr_encode_opaque_fixed(p, &fl->deviceid, + sizeof(struct nfsd4_deviceid)); + + *p++ = cpu_to_be32(1); /* efficiency */ + + *p++ = cpu_to_be32(fl->stateid.si_generation); + p = xdr_encode_opaque_fixed(p, &fl->stateid.si_opaque, + sizeof(stateid_opaque_t)); + + *p++ = cpu_to_be32(1); /* single file handle */ + p = xdr_encode_opaque(p, fl->fh.data, fl->fh.size); + + p = xdr_encode_opaque(p, uid.buf, uid.len); + p = xdr_encode_opaque(p, gid.buf, gid.len); + + *p++ = cpu_to_be32(fl->flags); + *p++ = cpu_to_be32(0); /* No stats collect hint */ + + return 0; +} + +__be32 +nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_ff_device_addr *da = gdp->gd_device; + int len; + int ver_len; + int addr_len; + __be32 *p; + + /* len + padding for two strings */ + addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len; + ver_len = 20; + + len = 4 + ver_len + 4 + addr_len; + + p = xdr_reserve_space(xdr, len + sizeof(__be32)); + if (!p) + return nfserr_resource; + + /* + * Fill in the overall length and number of volumes at the beginning + * of the layout. 
+ */ + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(1); /* 1 netaddr */ + p = xdr_encode_opaque(p, da->netaddr.netid, da->netaddr.netid_len); + p = xdr_encode_opaque(p, da->netaddr.addr, da->netaddr.addr_len); + + *p++ = cpu_to_be32(1); /* 1 versions */ + + *p++ = cpu_to_be32(da->version); + *p++ = cpu_to_be32(da->minor_version); + *p++ = cpu_to_be32(da->rsize); + *p++ = cpu_to_be32(da->wsize); + *p++ = cpu_to_be32(da->tightly_coupled); + + return 0; +} diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h new file mode 100644 index 000000000000..467defd4e563 --- /dev/null +++ b/fs/nfsd/flexfilelayoutxdr.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> + */ +#ifndef _NFSD_FLEXFILELAYOUTXDR_H +#define _NFSD_FLEXFILELAYOUTXDR_H 1 + +#include <linux/inet.h> +#include "xdr4.h" + +#define FF_FLAGS_NO_LAYOUTCOMMIT 1 +#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_READ_IO 4 + +struct xdr_stream; + +#define FF_NETID_LEN (4) +#define FF_ADDR_LEN (INET6_ADDRSTRLEN + 8) +struct pnfs_ff_netaddr { + char netid[FF_NETID_LEN + 1]; + char addr[FF_ADDR_LEN + 1]; + u32 netid_len; + u32 addr_len; +}; + +struct pnfs_ff_device_addr { + struct pnfs_ff_netaddr netaddr; + u32 version; + u32 minor_version; + u32 rsize; + u32 wsize; + bool tightly_coupled; +}; + +struct pnfs_ff_layout { + u32 flags; + u32 stats_collect_hint; + kuid_t uid; + kgid_t gid; + struct nfsd4_deviceid deviceid; + stateid_t stateid; + struct nfs_fh fh; +}; + +__be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp); +__be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp); + +#endif /* _NFSD_FLEXFILELAYOUTXDR_H */ diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 953c0755cb37..2be9602b0221 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -27,6 +27,9 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lock_manager_operations 
nfsd4_layouts_lm_ops; const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { +#ifdef CONFIG_NFSD_FLEXFILELAYOUT + [LAYOUT_FLEX_FILES] = &ff_layout_ops, +#endif #ifdef CONFIG_NFSD_BLOCKLAYOUT [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, #endif @@ -122,28 +125,35 @@ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, void nfsd4_setup_layout_type(struct svc_export *exp) { +#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT) struct super_block *sb = exp->ex_path.mnt->mnt_sb; +#endif if (!(exp->ex_flags & NFSEXP_PNFS)) return; /* - * Check if the file system supports exporting a block-like layout. + * If flex file is configured, use it by default. Otherwise + * check if the file system supports exporting a block-like layout. * If the block device supports reservations prefer the SCSI layout, * otherwise advertise the block layout. */ +#ifdef CONFIG_NFSD_FLEXFILELAYOUT + exp->ex_layout_types |= 1 << LAYOUT_FLEX_FILES; +#endif #ifdef CONFIG_NFSD_BLOCKLAYOUT + /* overwrite flex file layout selection if needed */ if (sb->s_export_op->get_uuid && sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks) - exp->ex_layout_type = LAYOUT_BLOCK_VOLUME; + exp->ex_layout_types |= 1 << LAYOUT_BLOCK_VOLUME; #endif #ifdef CONFIG_NFSD_SCSILAYOUT /* overwrite block layout selection if needed */ if (sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks && sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops) - exp->ex_layout_type = LAYOUT_SCSI; + exp->ex_layout_types |= 1 << LAYOUT_SCSI; #endif } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index de1ff1d98bb1..1fb222752b2b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -605,8 +605,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fh_init(&resfh, NFS4_FHSIZE); - status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, - NFSD_MAY_CREATE); + status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_NOP); if (status) return 
status; @@ -1219,12 +1218,12 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static const struct nfsd4_layout_ops * nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) { - if (!exp->ex_layout_type) { + if (!exp->ex_layout_types) { dprintk("%s: export does not support pNFS\n", __func__); return NULL; } - if (exp->ex_layout_type != layout_type) { + if (!(exp->ex_layout_types & (1 << layout_type))) { dprintk("%s: layout type %d not supported\n", __func__, layout_type); return NULL; @@ -1270,7 +1269,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp, nfserr = nfs_ok; if (gdp->gd_maxcount != 0) { nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, - cstate->session->se_client, gdp); + rqstp, cstate->session->se_client, gdp); } gdp->gd_notify_types &= ops->notify_types; @@ -2335,6 +2334,45 @@ static struct nfsd4_operation nfsd4_ops[] = { }, }; +/** + * nfsd4_spo_must_allow - Determine if the compound op contains an + * operation that is allowed to be sent with machine credentials + * + * @rqstp: a pointer to the struct svc_rqst + * + * Checks to see if the compound contains a spo_must_allow op + * and confirms that it was sent with the proper machine creds. 
+ */ + +bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_compound_state *cstate = &resp->cstate; + struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; + u32 opiter; + + if (!cstate->minorversion) + return false; + + if (cstate->spo_must_allowed == true) + return true; + + opiter = resp->opcnt; + while (opiter < argp->opcnt) { + this = &argp->ops[opiter++]; + if (test_bit(this->opnum, allow->u.longs) && + cstate->clp->cl_mach_cred && + nfsd4_mach_creds_match(cstate->clp, rqstp)) { + cstate->spo_must_allowed = true; + return true; + } + } + cstate->spo_must_allowed = false; + return false; +} + int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) { struct nfsd4_operation *opdesc; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 70d0b9b33031..8410ca275db1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1200,27 +1200,6 @@ free_ol_stateid_reaplist(struct list_head *reaplist) } } -static void release_lockowner(struct nfs4_lockowner *lo) -{ - struct nfs4_client *clp = lo->lo_owner.so_client; - struct nfs4_ol_stateid *stp; - struct list_head reaplist; - - INIT_LIST_HEAD(&reaplist); - - spin_lock(&clp->cl_lock); - unhash_lockowner_locked(lo); - while (!list_empty(&lo->lo_owner.so_stateids)) { - stp = list_first_entry(&lo->lo_owner.so_stateids, - struct nfs4_ol_stateid, st_perstateowner); - WARN_ON(!unhash_lock_stateid(stp)); - put_ol_stateid_locked(stp, &reaplist); - } - spin_unlock(&clp->cl_lock); - free_ol_stateid_reaplist(&reaplist); - nfs4_put_stateowner(&lo->lo_owner); -} - static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, struct list_head *reaplist) { @@ -1972,7 +1951,7 @@ static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp) service == RPC_GSS_SVC_PRIVACY; } -static bool mach_creds_match(struct nfs4_client *cl, 
struct svc_rqst *rqstp) +bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) { struct svc_cred *cr = &rqstp->rq_cred; @@ -2388,6 +2367,22 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, switch (exid->spa_how) { case SP4_MACH_CRED: + exid->spo_must_enforce[0] = 0; + exid->spo_must_enforce[1] = ( + 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32)); + + exid->spo_must_allow[0] &= (1 << (OP_CLOSE) | + 1 << (OP_OPEN_DOWNGRADE) | + 1 << (OP_LOCKU) | + 1 << (OP_DELEGRETURN)); + + exid->spo_must_allow[1] &= ( + 1 << (OP_TEST_STATEID - 32) | + 1 << (OP_FREE_STATEID - 32)); if (!svc_rqst_integrity_protected(rqstp)) { status = nfserr_inval; goto out_nolock; @@ -2424,7 +2419,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, status = nfserr_inval; goto out; } - if (!mach_creds_match(conf, rqstp)) { + if (!nfsd4_mach_creds_match(conf, rqstp)) { status = nfserr_wrong_cred; goto out; } @@ -2473,6 +2468,8 @@ out_new: goto out; } new->cl_minorversion = cstate->minorversion; + new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; + new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1]; gen_clid(new, nn); add_to_unconfirmed(new); @@ -2676,7 +2673,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (conf) { status = nfserr_wrong_cred; - if (!mach_creds_match(conf, rqstp)) + if (!nfsd4_mach_creds_match(conf, rqstp)) goto out_free_conn; cs_slot = &conf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); @@ -2692,7 +2689,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; } status = nfserr_wrong_cred; - if (!mach_creds_match(unconf, rqstp)) + if (!nfsd4_mach_creds_match(unconf, rqstp)) goto out_free_conn; cs_slot = &unconf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); @@ -2801,7 +2798,7 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, if (!session) 
goto out_no_session; status = nfserr_wrong_cred; - if (!mach_creds_match(session->se_client, rqstp)) + if (!nfsd4_mach_creds_match(session->se_client, rqstp)) goto out; status = nfsd4_map_bcts_dir(&bcts->dir); if (status) @@ -2848,7 +2845,7 @@ nfsd4_destroy_session(struct svc_rqst *r, if (!ses) goto out_client_lock; status = nfserr_wrong_cred; - if (!mach_creds_match(ses->se_client, r)) + if (!nfsd4_mach_creds_match(ses->se_client, r)) goto out_put_session; status = mark_session_dead_locked(ses, 1 + ref_held_by_me); if (status) @@ -3087,7 +3084,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta status = nfserr_stale_clientid; goto out; } - if (!mach_creds_match(clp, rqstp)) { + if (!nfsd4_mach_creds_match(clp, rqstp)) { clp = NULL; status = nfserr_wrong_cred; goto out; @@ -3112,7 +3109,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta * We don't take advantage of the rca_one_fs case. * That's OK, it's optional, we can safely ignore it. 
*/ - return nfs_ok; + return nfs_ok; } status = nfserr_complete_already; @@ -5945,6 +5942,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfs4_client *clp; + LIST_HEAD (reaplist); dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); @@ -5975,9 +5973,23 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, nfs4_get_stateowner(sop); break; } + if (!lo) { + spin_unlock(&clp->cl_lock); + return status; + } + + unhash_lockowner_locked(lo); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, + st_perstateowner); + WARN_ON(!unhash_lock_stateid(stp)); + put_ol_stateid_locked(stp, &reaplist); + } spin_unlock(&clp->cl_lock); - if (lo) - release_lockowner(lo); + free_ol_stateid_reaplist(&reaplist); + nfs4_put_stateowner(&lo->lo_owner); + return status; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9df898ba648f..0aa0236a1429 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1299,16 +1299,14 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, break; case SP4_MACH_CRED: /* spo_must_enforce */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; - + status = nfsd4_decode_bitmap(argp, + exid->spo_must_enforce); + if (status) + goto out; /* spo_must_allow */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; + status = nfsd4_decode_bitmap(argp, exid->spo_must_allow); + if (status) + goto out; break; case SP4_SSV: /* ssp_ops */ @@ -2164,22 +2162,20 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, } static inline __be32 -nfsd4_encode_layout_type(struct xdr_stream *xdr, enum pnfs_layouttype layout_type) +nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types) { - __be32 *p; + __be32 *p; + unsigned long i = hweight_long(layout_types); - if (layout_type) { - p = 
xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(1); - *p++ = cpu_to_be32(layout_type); - } else { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); - } + p = xdr_reserve_space(xdr, 4 + 4 * i); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(i); + + for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) + if (layout_types & (1 << i)) + *p++ = cpu_to_be32(i); return 0; } @@ -2754,13 +2750,13 @@ out_acl: } #ifdef CONFIG_NFSD_PNFS if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type); + status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); if (status) goto out; } if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) { - status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type); + status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); if (status) goto out; } @@ -3867,14 +3863,6 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w return nfserr; } -static const u32 nfs4_minimal_spo_must_enforce[2] = { - [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | - 1 << (OP_EXCHANGE_ID - 32) | - 1 << (OP_CREATE_SESSION - 32) | - 1 << (OP_DESTROY_SESSION - 32) | - 1 << (OP_DESTROY_CLIENTID - 32) -}; - static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_exchange_id *exid) @@ -3885,6 +3873,7 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, char *server_scope; int major_id_sz; int server_scope_sz; + int status = 0; uint64_t minor_id = 0; if (nfserr) @@ -3913,18 +3902,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, case SP4_NONE: break; case SP4_MACH_CRED: - /* spo_must_enforce, spo_must_allow */ - p = xdr_reserve_space(xdr, 16); - if (!p) - return nfserr_resource; - /* spo_must_enforce bitmap: */ - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]); - *p++ = 
cpu_to_be32(nfs4_minimal_spo_must_enforce[1]); - /* empty spo_must_allow bitmap: */ - *p++ = cpu_to_be32(0); - + status = nfsd4_encode_bitmap(xdr, + exid->spo_must_enforce[0], + exid->spo_must_enforce[1], + exid->spo_must_enforce[2]); + if (status) + goto out; + /* spo_must_allow bitmap: */ + status = nfsd4_encode_bitmap(xdr, + exid->spo_must_allow[0], + exid->spo_must_allow[1], + exid->spo_must_allow[2]); + if (status) + goto out; break; default: WARN_ON_ONCE(1); @@ -3951,6 +3942,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, /* Implementation id */ *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */ return 0; +out: + return status; } static __be32 diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index cf980523898b..9446849888d5 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -124,6 +124,7 @@ void nfs4_state_shutdown_net(struct net *net); void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); char * nfs4_recoverydir(void); +bool nfsd4_spo_must_allow(struct svc_rqst *rqstp); #else static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } @@ -134,6 +135,10 @@ static inline void nfs4_state_shutdown_net(struct net *net) { } static inline void nfs4_reset_lease(time_t leasetime) { } static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } static inline char * nfs4_recoverydir(void) {return NULL; } +static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) +{ + return false; +} #endif /* diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index a8919444c460..cfe7500d5847 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -59,14 +59,20 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) * the write call). 
*/ static inline __be32 -nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested) +nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry, + umode_t requested) { - mode &= S_IFMT; + umode_t mode = d_inode(dentry)->i_mode & S_IFMT; if (requested == 0) /* the caller doesn't care */ return nfs_ok; - if (mode == requested) + if (mode == requested) { + if (mode == S_IFDIR && !d_can_lookup(dentry)) { + WARN_ON_ONCE(1); + return nfserr_notdir; + } return nfs_ok; + } /* * v4 has an error more specific than err_notdir which we should * return in preference to err_notdir: @@ -298,7 +304,7 @@ out: * that it expects something not of the given type. * * @access is formed from the NFSD_MAY_* constants defined in - * include/linux/nfsd/nfsd.h. + * fs/nfsd/vfs.h. */ __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) @@ -340,7 +346,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) if (error) goto out; - error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type); + error = nfsd_mode_check(rqstp, dentry, type); if (error) goto out; @@ -533,7 +539,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, * the reference filehandle (if it is in the same export) * or the export options. */ - set_version_and_fsid_type(fhp, exp, ref_fh); + set_version_and_fsid_type(fhp, exp, ref_fh); if (ref_fh == fhp) fh_put(ref_fh); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 4cd78ef4c95c..e9214768cde9 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -251,9 +251,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ - nfserr = nfserr_acces; - if (!argp->len) - goto done; nfserr = nfserr_exist; if (isdotent(argp->name, argp->len)) goto done; @@ -362,8 +359,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, nfserr = 0; if (!inode) { /* File doesn't exist. 
Create it and set attrs */ - nfserr = nfsd_create(rqstp, dirfhp, argp->name, argp->len, - attr, type, rdev, newfhp); + nfserr = nfsd_create_locked(rqstp, dirfhp, argp->name, + argp->len, attr, type, rdev, newfhp); } else if (type == S_IFREG) { dprintk("nfsd: existing %s, valid=%x, size=%ld\n", argp->name, attr->ia_valid, (long) attr->ia_size); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 79d964aa8079..41b468a6a90f 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -240,7 +240,7 @@ nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p, || !(p = decode_filename(p, &args->name, &args->len))) return 0; - return xdr_argsize_check(rqstp, p); + return xdr_argsize_check(rqstp, p); } int diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 7d073b9b1553..0c2a716e8741 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -21,6 +21,7 @@ struct nfsd4_layout_ops { u32 notify_types; __be32 (*proc_getdeviceinfo)(struct super_block *sb, + struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdevp); __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, @@ -44,6 +45,9 @@ extern const struct nfsd4_layout_ops bl_layout_ops; #ifdef CONFIG_NFSD_SCSILAYOUT extern const struct nfsd4_layout_ops scsi_layout_ops; #endif +#ifdef CONFIG_NFSD_FLEXFILELAYOUT +extern const struct nfsd4_layout_ops ff_layout_ops; +#endif __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 64053eadeb81..b95adf9a1595 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -345,6 +345,7 @@ struct nfs4_client { u32 cl_exchange_flags; /* number of rpc's in progress over an associated session: */ atomic_t cl_refcount; + struct nfs4_op_map cl_spo_must_allow; /* for nfs41 callbacks */ /* We currently support a single back channel with a single slot */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6fbd81ecb410..ba944123167b 100644 --- a/fs/nfsd/vfs.c +++ 
b/fs/nfsd/vfs.c @@ -1135,96 +1135,37 @@ nfsd_check_ignore_resizing(struct iattr *iap) iap->ia_valid &= ~ATTR_SIZE; } -/* - * Create a file (regular, directory, device, fifo); UNIX sockets - * not yet implemented. - * If the response fh has been verified, the parent directory should - * already be locked. Note that the parent directory is left locked. - * - * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp - */ +/* The parent directory should already be locked: */ __be32 -nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, +nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, struct iattr *iap, int type, dev_t rdev, struct svc_fh *resfhp) { - struct dentry *dentry, *dchild = NULL; + struct dentry *dentry, *dchild; struct inode *dirp; __be32 err; __be32 err2; int host_err; - err = nfserr_perm; - if (!flen) - goto out; - err = nfserr_exist; - if (isdotent(fname, flen)) - goto out; - - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); - if (err) - goto out; - dentry = fhp->fh_dentry; dirp = d_inode(dentry); - err = nfserr_notdir; - if (!dirp->i_op->lookup) - goto out; - /* - * Check whether the response file handle has been verified yet. - * If it has, the parent directory should already be locked. 
- */ - if (!resfhp->fh_dentry) { - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - - /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ - fh_lock_nested(fhp, I_MUTEX_PARENT); - dchild = lookup_one_len(fname, dentry, flen); - host_err = PTR_ERR(dchild); - if (IS_ERR(dchild)) - goto out_nfserr; - err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - if (err) - goto out; - } else { - /* called from nfsd_proc_create */ - dchild = dget(resfhp->fh_dentry); - if (!fhp->fh_locked) { - /* not actually possible */ - printk(KERN_ERR - "nfsd_create: parent %pd2 not locked!\n", + dchild = dget(resfhp->fh_dentry); + if (!fhp->fh_locked) { + WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n", dentry); - err = nfserr_io; - goto out; - } - } - /* - * Make sure the child dentry is still negative ... - */ - err = nfserr_exist; - if (d_really_is_positive(dchild)) { - dprintk("nfsd_create: dentry %pd/%pd not negative!\n", - dentry, dchild); - goto out; + err = nfserr_io; + goto out; } + err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE); + if (err) + goto out; + if (!(iap->ia_valid & ATTR_MODE)) iap->ia_mode = 0; iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; - err = nfserr_inval; - if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) { - printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", - type); - goto out; - } - - /* - * Get the dir op function pointer. - */ err = 0; host_err = 0; switch (type) { @@ -1242,6 +1183,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, case S_IFSOCK: host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); break; + default: + printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", + type); + host_err = -EINVAL; } if (host_err < 0) goto out_nfserr; @@ -1251,7 +1196,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, /* * nfsd_create_setattr already committed the child. 
Transactional * filesystems had a chance to commit changes for both parent and - * child * simultaneously making the following commit_metadata a + * child simultaneously making the following commit_metadata a * noop. */ err2 = nfserrno(commit_metadata(fhp)); @@ -1263,8 +1208,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!err) err = fh_update(resfhp); out: - if (dchild && !IS_ERR(dchild)) - dput(dchild); + dput(dchild); return err; out_nfserr: @@ -1272,6 +1216,50 @@ out_nfserr: goto out; } +/* + * Create a filesystem object (regular, directory, special). + * Note that the parent directory is left locked. + * + * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp + */ +__be32 +nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + int type, dev_t rdev, struct svc_fh *resfhp) +{ + struct dentry *dentry, *dchild = NULL; + struct inode *dirp; + __be32 err; + int host_err; + + if (isdotent(fname, flen)) + return nfserr_exist; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_NOP); + if (err) + return err; + + dentry = fhp->fh_dentry; + dirp = d_inode(dentry); + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + fh_lock_nested(fhp, I_MUTEX_PARENT); + dchild = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + return nfserrno(host_err); + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) { + dput(dchild); + return err; + } + return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type, + rdev, resfhp); +} + #ifdef CONFIG_NFSD_V3 /* @@ -1304,12 +1292,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; dirp = d_inode(dentry); - /* Get all the sanity checks out of the way before - * we lock the parent. 
*/ - err = nfserr_notdir; - if (!dirp->i_op->lookup) - goto out; - host_err = fh_want_write(fhp); if (host_err) goto out_nfserr; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 2d573ec057f8..3cbb1b33777b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -59,6 +59,9 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, __be32 nfsd4_clone_file_range(struct file *, u64, struct file *, u64, u64); #endif /* CONFIG_NFSD_V4 */ +__be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, int type, dev_t rdev, struct svc_fh *res); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d9554813e58a..beea0c5edc51 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -59,6 +59,7 @@ struct nfsd4_compound_state { struct nfsd4_session *session; struct nfsd4_slot *slot; int data_offset; + bool spo_must_allowed; size_t iovlen; u32 minorversion; __be32 status; @@ -403,6 +404,8 @@ struct nfsd4_exchange_id { clientid_t clientid; u32 seqid; int spa_how; + u32 spo_must_enforce[3]; + u32 spo_must_allow[3]; }; struct nfsd4_sequence { @@ -654,6 +657,8 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) } + +bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, struct nfsd4_compoundargs *); diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 9718da86ad01..821b34816976 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -100,10 +100,6 @@ static int switch_gc_head(struct ubifs_info *c) if (err) return err; - err = ubifs_wbuf_sync_nolock(wbuf); - if (err) - return err; - err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); if (err) return err; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 70349954e78b..4ec051089186 100644 --- 
a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -520,19 +520,19 @@ static int init_constants_early(struct ubifs_info *c) c->max_write_shift = fls(c->max_write_size) - 1; if (c->leb_size < UBIFS_MIN_LEB_SZ) { - ubifs_err(c, "too small LEBs (%d bytes), min. is %d bytes", - c->leb_size, UBIFS_MIN_LEB_SZ); + ubifs_errc(c, "too small LEBs (%d bytes), min. is %d bytes", + c->leb_size, UBIFS_MIN_LEB_SZ); return -EINVAL; } if (c->leb_cnt < UBIFS_MIN_LEB_CNT) { - ubifs_err(c, "too few LEBs (%d), min. is %d", - c->leb_cnt, UBIFS_MIN_LEB_CNT); + ubifs_errc(c, "too few LEBs (%d), min. is %d", + c->leb_cnt, UBIFS_MIN_LEB_CNT); return -EINVAL; } if (!is_power_of_2(c->min_io_size)) { - ubifs_err(c, "bad min. I/O size %d", c->min_io_size); + ubifs_errc(c, "bad min. I/O size %d", c->min_io_size); return -EINVAL; } @@ -543,8 +543,8 @@ static int init_constants_early(struct ubifs_info *c) if (c->max_write_size < c->min_io_size || c->max_write_size % c->min_io_size || !is_power_of_2(c->max_write_size)) { - ubifs_err(c, "bad write buffer size %d for %d min. I/O unit", - c->max_write_size, c->min_io_size); + ubifs_errc(c, "bad write buffer size %d for %d min. 
I/O unit", + c->max_write_size, c->min_io_size); return -EINVAL; } @@ -2108,8 +2108,9 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, */ ubi = open_ubi(name, UBI_READONLY); if (IS_ERR(ubi)) { - pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", - current->pid, name, (int)PTR_ERR(ubi)); + if (!(flags & MS_SILENT)) + pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", + current->pid, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index ddf9f6b9eee2..4617d459022a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1783,8 +1783,8 @@ void ubifs_err(const struct ubifs_info *c, const char *fmt, ...); __printf(2, 3) void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...); /* - * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description - * object as an argument. + * A conditional variant of 'ubifs_err()' which doesn't output anything + * if probing (ie. MS_SILENT set). */ #define ubifs_errc(c, fmt, ...) 
\ do { \ diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index b5fc27969e9d..e237811f09ce 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -592,19 +592,19 @@ static int ubifs_xattr_set(const struct xattr_handler *handler, return __ubifs_removexattr(inode, name); } -const struct xattr_handler ubifs_user_xattr_handler = { +static const struct xattr_handler ubifs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = ubifs_xattr_get, .set = ubifs_xattr_set, }; -const struct xattr_handler ubifs_trusted_xattr_handler = { +static const struct xattr_handler ubifs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = ubifs_xattr_get, .set = ubifs_xattr_set, }; -const struct xattr_handler ubifs_security_xattr_handler = { +static const struct xattr_handler ubifs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = ubifs_xattr_get, .set = ubifs_xattr_set, diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 3542d94fddce..52c288514be1 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -121,5 +121,4 @@ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o -xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o -xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o +xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index a1b2dd828b9d..fe1bfee35898 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = { .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, .commit_metadata = xfs_fs_nfs_commit_metadata, -#ifdef CONFIG_NFSD_BLOCKLAYOUT +#ifdef CONFIG_EXPORTFS_BLOCK_OPS .get_uuid = xfs_fs_get_uuid, .map_blocks = xfs_fs_map_blocks, .commit_blocks = xfs_fs_commit_blocks, diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index 93f74853961b..e8339f74966b 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -1,7 +1,7 
@@ #ifndef _XFS_PNFS_H #define _XFS_PNFS_H 1 -#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT) +#ifdef CONFIG_EXPORTFS_BLOCK_OPS int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, struct iomap *iomap, bool write, u32 *device_generation); @@ -15,5 +15,5 @@ xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) { return 0; } -#endif /* CONFIG_NFSD_PNFS */ +#endif /* CONFIG_EXPORTFS_BLOCK_OPS */ #endif /* _XFS_PNFS_H */ diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 314b3caa701c..1303b570b18c 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -113,6 +113,8 @@ extern int suid_dumpable; extern int setup_arg_pages(struct linux_binprm * bprm, unsigned long stack_top, int executable_stack); +extern int transfer_args_to_stack(struct linux_binprm *bprm, + unsigned long *sp_location); extern int bprm_change_interp(char *interp, struct linux_binprm *bprm); extern int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm); diff --git a/include/linux/ds17287rtc.h b/include/linux/ds17287rtc.h deleted file mode 100644 index d85d3f497b96..000000000000 --- a/include/linux/ds17287rtc.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * ds17287rtc.h - register definitions for the ds1728[57] RTC / CMOS RAM - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. 
- * - * (C) 2003 Guido Guenther <agx@sigxcpu.org> - */ -#ifndef __LINUX_DS17287RTC_H -#define __LINUX_DS17287RTC_H - -#include <linux/rtc.h> /* get the user-level API */ -#include <linux/mc146818rtc.h> - -/* Register A */ -#define DS_REGA_DV2 0x40 /* countdown chain */ -#define DS_REGA_DV1 0x20 /* oscillator enable */ -#define DS_REGA_DV0 0x10 /* bank select */ - -/* bank 1 registers */ -#define DS_B1_MODEL 0x40 /* model number byte */ -#define DS_B1_SN1 0x41 /* serial number byte 1 */ -#define DS_B1_SN2 0x42 /* serial number byte 2 */ -#define DS_B1_SN3 0x43 /* serial number byte 3 */ -#define DS_B1_SN4 0x44 /* serial number byte 4 */ -#define DS_B1_SN5 0x45 /* serial number byte 5 */ -#define DS_B1_SN6 0x46 /* serial number byte 6 */ -#define DS_B1_CRC 0x47 /* CRC byte */ -#define DS_B1_CENTURY 0x48 /* Century byte */ -#define DS_B1_DALARM 0x49 /* date alarm */ -#define DS_B1_XCTRL4A 0x4a /* extendec control register 4a */ -#define DS_B1_XCTRL4B 0x4b /* extendec control register 4b */ -#define DS_B1_RTCADDR2 0x4e /* rtc address 2 */ -#define DS_B1_RTCADDR3 0x4f /* rtc address 3 */ -#define DS_B1_RAMLSB 0x50 /* extended ram LSB */ -#define DS_B1_RAMMSB 0x51 /* extended ram MSB */ -#define DS_B1_RAMDPORT 0x53 /* extended ram data port */ - -/* register details */ -/* extended control register 4a */ -#define DS_XCTRL4A_VRT2 0x80 /* valid ram and time */ -#define DS_XCTRL4A_INCR 0x40 /* increment progress status */ -#define DS_XCTRL4A_BME 0x20 /* burst mode enable */ -#define DS_XCTRL4A_PAB 0x08 /* power active bar ctrl */ -#define DS_XCTRL4A_RF 0x04 /* ram clear flag */ -#define DS_XCTRL4A_WF 0x02 /* wake up alarm flag */ -#define DS_XCTRL4A_KF 0x01 /* kickstart flag */ - -/* interrupt causes */ -#define DS_XCTRL4A_IFS (DS_XCTRL4A_RF|DS_XCTRL4A_WF|DS_XCTRL4A_KF) - -/* extended control register 4b */ -#define DS_XCTRL4B_ABE 0x80 /* auxiliary battery enable */ -#define DS_XCTRL4B_E32K 0x40 /* enable 32.768 kHz Output */ -#define DS_XCTRL4B_CS 0x20 /* crystal select */ 
-#define DS_XCTRL4B_RCE 0x10 /* ram clear enable */ -#define DS_XCTRL4B_PRS 0x08 /* PAB resec select */ -#define DS_XCTRL4B_RIE 0x04 /* ram clear interrupt enable */ -#define DS_XCTRL4B_WFE 0x02 /* wake up alarm interrupt enable */ -#define DS_XCTRL4B_KFE 0x01 /* kickstart interrupt enable */ - -/* interrupt enable bits */ -#define DS_XCTRL4B_IFES (DS_XCTRL4B_RIE|DS_XCTRL4B_WFE|DS_XCTRL4B_KFE) - -#endif /* __LINUX_DS17287RTC_H */ diff --git a/include/linux/i8042.h b/include/linux/i8042.h index 0f9bafa17a02..d98780ca9604 100644 --- a/include/linux/i8042.h +++ b/include/linux/i8042.h @@ -62,7 +62,6 @@ struct serio; void i8042_lock_chip(void); void i8042_unlock_chip(void); int i8042_command(unsigned char *param, int command); -bool i8042_check_port_owner(const struct serio *); int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, struct serio *serio)); int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str, @@ -83,11 +82,6 @@ static inline int i8042_command(unsigned char *param, int command) return -ENODEV; } -static inline bool i8042_check_port_owner(const struct serio *serio) -{ - return false; -} - static inline int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, struct serio *serio)) { diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index 433e0c74d643..a585b4b5fa0e 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -14,6 +14,8 @@ #include <asm/io.h> #include <linux/rtc.h> /* get the user-level API */ #include <asm/mc146818rtc.h> /* register access macros */ +#include <linux/bcd.h> +#include <linux/delay.h> #ifdef __KERNEL__ #include <linux/spinlock.h> /* spinlock_t */ @@ -120,4 +122,7 @@ struct cmos_rtc_board_info { #define RTC_IO_EXTENT_USED RTC_IO_EXTENT #endif /* ARCH_RTC_LOCATION */ +unsigned int mc146818_get_time(struct rtc_time *time); +int mc146818_set_time(struct rtc_time *time); + #endif /* _MC146818RTC_H */ diff --git 
a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index e6f6910278f3..42da3552f7cb 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -220,6 +220,7 @@ enum { MLX4_DEV_CAP_FLAG2_LB_SRC_CHK = 1ULL << 32, MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER = 1ULL << 34, + MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT = 1ULL << 35, }; enum { @@ -1342,6 +1343,9 @@ enum { VXLAN_STEER_BY_INNER_VLAN = 1 << 4, }; +enum { + MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS = 0x2, +}; int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn, enum mlx4_net_trans_promisc_mode mode); @@ -1382,6 +1386,9 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); int mlx4_test_interrupts(struct mlx4_dev *dev); +int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier, + const u32 offset[], u32 value[], + size_t array_len, u8 port); u32 mlx4_get_eqs_per_port(struct mlx4_dev *dev, u8 port); bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector); struct cpu_rmap *mlx4_get_cpu_rmap(struct mlx4_dev *dev, int port); diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 2be976dd4966..2566f6d6444f 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -58,6 +58,8 @@ struct mlx5_core_cq { void (*comp)(struct mlx5_core_cq *); void *priv; } tasklet_ctx; + int reset_notify_added; + struct list_head reset_notify; }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a041b99fceac..ccea6fb16482 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -46,6 +46,7 @@ #include <linux/mlx5/device.h> #include <linux/mlx5/doorbell.h> +#include <linux/mlx5/srq.h> enum { MLX5_RQ_BITMASK_VSD = 1 << 1, @@ -798,11 +799,10 @@ struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev, void 
mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev, struct mlx5_cmd_mailbox *head); int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_create_srq_mbox_in *in, int inlen, - int is_xrc); + struct mlx5_srq_attr *in); int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq); int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, - struct mlx5_query_srq_mbox_out *out); + struct mlx5_srq_attr *out); int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, u16 lwm, int is_srq); void mlx5_init_mkey_table(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index ab310819ac36..7879bf411891 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -556,9 +556,9 @@ struct mlx5_destroy_qp_mbox_out { struct mlx5_modify_qp_mbox_in { struct mlx5_inbox_hdr hdr; __be32 qpn; - u8 rsvd1[4]; - __be32 optparam; u8 rsvd0[4]; + __be32 optparam; + u8 rsvd1[4]; struct mlx5_qp_context ctx; u8 rsvd2[16]; }; diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h index f43ed054a3e0..33c97dc900f8 100644 --- a/include/linux/mlx5/srq.h +++ b/include/linux/mlx5/srq.h @@ -35,6 +35,31 @@ #include <linux/mlx5/driver.h> +enum { + MLX5_SRQ_FLAG_ERR = (1 << 0), + MLX5_SRQ_FLAG_WQ_SIG = (1 << 1), +}; + +struct mlx5_srq_attr { + u32 type; + u32 flags; + u32 log_size; + u32 wqe_shift; + u32 log_page_size; + u32 wqe_cnt; + u32 srqn; + u32 xrcd; + u32 page_offset; + u32 cqn; + u32 pd; + u32 lwm; + u32 user_index; + u64 db_record; + u64 *pas; +}; + +struct mlx5_core_dev; + void mlx5_init_srq_table(struct mlx5_core_dev *dev); void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev); diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index bfed6b367350..c6564ada9beb 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -643,4 +643,15 @@ enum pnfs_update_layout_reason { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, }; +#define 
NFS4_OP_MAP_NUM_LONGS \ + DIV_ROUND_UP(LAST_NFS4_OP, 8 * sizeof(unsigned long)) +#define NFS4_OP_MAP_NUM_WORDS \ + (NFS4_OP_MAP_NUM_LONGS * sizeof(unsigned long) / sizeof(u32)) +struct nfs4_op_map { + union { + unsigned long longs[NFS4_OP_MAP_NUM_LONGS]; + u32 words[NFS4_OP_MAP_NUM_WORDS]; + } u; +}; + #endif diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 82b81a1c2438..5bcbbe511be6 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1185,17 +1185,6 @@ struct pnfs_ds_commit_info { struct pnfs_commit_bucket *buckets; }; -#define NFS4_OP_MAP_NUM_LONGS \ - DIV_ROUND_UP(LAST_NFS4_OP, 8 * sizeof(unsigned long)) -#define NFS4_OP_MAP_NUM_WORDS \ - (NFS4_OP_MAP_NUM_LONGS * sizeof(unsigned long) / sizeof(u32)) -struct nfs4_op_map { - union { - unsigned long longs[NFS4_OP_MAP_NUM_LONGS]; - u32 words[NFS4_OP_MAP_NUM_WORDS]; - } u; -}; - struct nfs41_state_protection { u32 how; struct nfs4_op_map enforce; diff --git a/include/linux/rtc-ds2404.h b/include/linux/platform_data/rtc-ds2404.h index 22c53825528f..22c53825528f 100644 --- a/include/linux/rtc-ds2404.h +++ b/include/linux/platform_data/rtc-ds2404.h diff --git a/include/linux/m48t86.h b/include/linux/platform_data/rtc-m48t86.h index 915d6b4f0f89..915d6b4f0f89 100644 --- a/include/linux/m48t86.h +++ b/include/linux/platform_data/rtc-m48t86.h diff --git a/include/linux/rtc-v3020.h b/include/linux/platform_data/rtc-v3020.h index e55d82cebf80..e55d82cebf80 100644 --- a/include/linux/rtc-v3020.h +++ b/include/linux/platform_data/rtc-v3020.h diff --git a/include/linux/ds1286.h b/include/linux/rtc/ds1286.h index 45ea0aa0aeb9..45ea0aa0aeb9 100644 --- a/include/linux/ds1286.h +++ b/include/linux/rtc/ds1286.h diff --git a/include/linux/serio.h b/include/linux/serio.h index df4ab5de1586..c733cff44e18 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -31,7 +31,8 @@ struct serio { struct serio_device_id id; - spinlock_t lock; /* protects critical sections from port's interrupt 
handler */ + /* Protects critical sections from port's interrupt handler */ + spinlock_t lock; int (*write)(struct serio *, unsigned char); int (*open)(struct serio *); @@ -40,16 +41,29 @@ struct serio { void (*stop)(struct serio *); struct serio *parent; - struct list_head child_node; /* Entry in parent->children list */ + /* Entry in parent->children list */ + struct list_head child_node; struct list_head children; - unsigned int depth; /* level of nesting in serio hierarchy */ + /* Level of nesting in serio hierarchy */ + unsigned int depth; - struct serio_driver *drv; /* accessed from interrupt, must be protected by serio->lock and serio->sem */ - struct mutex drv_mutex; /* protects serio->drv so attributes can pin driver */ + /* + * serio->drv is accessed from interrupt handlers; when modifying + * caller should acquire serio->drv_mutex and serio->lock. + */ + struct serio_driver *drv; + /* Protects serio->drv so attributes can pin current driver */ + struct mutex drv_mutex; struct device dev; struct list_head node; + + /* + * For use by PS/2 layer when several ports share hardware and + * may get indigestion when exposed to concurrent access (i8042). 
+ */ + struct mutex *ps2_cmd_mutex; }; #define to_serio_port(d) container_of(d, struct serio, dev) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index ed03c9f7f908..62a60eeacb0a 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -78,8 +78,6 @@ struct cache_detail { struct hlist_head * hash_table; rwlock_t hash_lock; - atomic_t inuse; /* active user-space update or lookup */ - char *name; void (*cache_put)(struct kref *); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 7ca44fb5b675..7321ae933867 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -268,6 +268,7 @@ struct svc_rqst { * cache pages */ #define RQ_VICTIM (5) /* about to be shut down */ #define RQ_BUSY (6) /* request is busy */ +#define RQ_DATA (7) /* request has data */ unsigned long rq_flags; /* flags field */ void * rq_argp; /* decoded arguments */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 79ba50856707..ab02a457da1f 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -25,7 +25,6 @@ struct svc_xprt_ops { void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); int (*xpo_secure_port)(struct svc_rqst *); - void (*xpo_adjust_wspace)(struct svc_xprt *); }; struct svc_xprt_class { @@ -69,6 +68,7 @@ struct svc_xprt { struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ + atomic_t xpt_nr_rqsts; /* Number of requests */ struct mutex xpt_mutex; /* to serialize sending data */ spinlock_t xpt_lock; /* protects sk_deferred * and xpt_auth_cache */ diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 384041669489..5ee7aab95eb8 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -94,6 +94,19 @@ enum ib_sa_selector { IB_SA_BEST = 3 }; +/* + * There are 4 types of join states: + * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. 
+ * The order corresponds to JoinState bits in MCMemberRecord. + */ +enum ib_sa_mc_join_states { + FULLMEMBER_JOIN, + NONMEMBER_JOIN, + SENDONLY_NONMEBER_JOIN, + SENDONLY_FULLMEMBER_JOIN, + NUM_JOIN_MEMBERSHIP_TYPES, +}; + #define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT BIT(12) /* diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a8137dcf5a00..8e90dd28bb75 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -562,6 +562,7 @@ enum ib_event_type { IB_EVENT_QP_LAST_WQE_REACHED, IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, + IB_EVENT_WQ_FATAL, }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event); @@ -572,6 +573,7 @@ struct ib_event { struct ib_cq *cq; struct ib_qp *qp; struct ib_srq *srq; + struct ib_wq *wq; u8 port_num; } element; enum ib_event_type event; @@ -1015,6 +1017,7 @@ struct ib_qp_init_attr { * Only needed for special QP types, or when using the RW API. */ u8 port_num; + struct ib_rwq_ind_table *rwq_ind_tbl; }; struct ib_qp_open_attr { @@ -1323,6 +1326,8 @@ struct ib_ucontext { struct list_head ah_list; struct list_head xrcd_list; struct list_head rule_list; + struct list_head wq_list; + struct list_head rwq_ind_tbl_list; int closing; struct pid *tgid; @@ -1428,6 +1433,67 @@ struct ib_srq { } ext; }; +enum ib_wq_type { + IB_WQT_RQ +}; + +enum ib_wq_state { + IB_WQS_RESET, + IB_WQS_RDY, + IB_WQS_ERR +}; + +struct ib_wq { + struct ib_device *device; + struct ib_uobject *uobject; + void *wq_context; + void (*event_handler)(struct ib_event *, void *); + struct ib_pd *pd; + struct ib_cq *cq; + u32 wq_num; + enum ib_wq_state state; + enum ib_wq_type wq_type; + atomic_t usecnt; +}; + +struct ib_wq_init_attr { + void *wq_context; + enum ib_wq_type wq_type; + u32 max_wr; + u32 max_sge; + struct ib_cq *cq; + void (*event_handler)(struct ib_event *, void *); +}; + +enum ib_wq_attr_mask { + IB_WQ_STATE = 1 << 0, + IB_WQ_CUR_STATE = 1 << 1, +}; + +struct ib_wq_attr { + enum ib_wq_state wq_state; + enum 
ib_wq_state curr_wq_state; +}; + +struct ib_rwq_ind_table { + struct ib_device *device; + struct ib_uobject *uobject; + atomic_t usecnt; + u32 ind_tbl_num; + u32 log_ind_tbl_size; + struct ib_wq **ind_tbl; +}; + +struct ib_rwq_ind_table_init_attr { + u32 log_ind_tbl_size; + /* Each entry is a pointer to Receive Work Queue */ + struct ib_wq **ind_tbl; +}; + +/* + * @max_write_sge: Maximum SGE elements per RDMA WRITE request. + * @max_read_sge: Maximum SGE elements per RDMA READ request. + */ struct ib_qp { struct ib_device *device; struct ib_pd *pd; @@ -1449,7 +1515,10 @@ struct ib_qp { void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; + u32 max_write_sge; + u32 max_read_sge; enum ib_qp_type qp_type; + struct ib_rwq_ind_table *rwq_ind_tbl; }; struct ib_mr { @@ -1506,6 +1575,7 @@ enum ib_flow_spec_type { IB_FLOW_SPEC_IB = 0x22, /* L3 header*/ IB_FLOW_SPEC_IPV4 = 0x30, + IB_FLOW_SPEC_IPV6 = 0x31, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41 @@ -1567,6 +1637,18 @@ struct ib_flow_spec_ipv4 { struct ib_flow_ipv4_filter mask; }; +struct ib_flow_ipv6_filter { + u8 src_ip[16]; + u8 dst_ip[16]; +}; + +struct ib_flow_spec_ipv6 { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_ipv6_filter val; + struct ib_flow_ipv6_filter mask; +}; + struct ib_flow_tcp_udp_filter { __be16 dst_port; __be16 src_port; @@ -1588,6 +1670,7 @@ union ib_flow_spec { struct ib_flow_spec_ib ib; struct ib_flow_spec_ipv4 ipv4; struct ib_flow_spec_tcp_udp tcp_udp; + struct ib_flow_spec_ipv6 ipv6; }; struct ib_flow_attr { @@ -1921,7 +2004,18 @@ struct ib_device { struct ifla_vf_stats *stats); int (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid, int type); - + struct ib_wq * (*create_wq)(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); + int (*destroy_wq)(struct ib_wq *wq); + int (*modify_wq)(struct ib_wq *wq, + struct ib_wq_attr *attr, + u32 wq_attr_mask, + struct ib_udata *udata); + struct 
ib_rwq_ind_table * (*create_rwq_ind_table)(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); + int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); struct ib_dma_mapping_ops *dma_ops; struct module *owner; @@ -1956,6 +2050,7 @@ struct ib_device { * in fast paths. */ int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); + void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len); }; struct ib_client { @@ -1991,6 +2086,8 @@ struct ib_client { struct ib_device *ib_alloc_device(size_t size); void ib_dealloc_device(struct ib_device *device); +void ib_get_device_fw_str(struct ib_device *device, char *str, size_t str_len); + int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); @@ -3168,6 +3265,15 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); +struct ib_wq *ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr); +int ib_destroy_wq(struct ib_wq *wq); +int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr, + u32 wq_attr_mask); +struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr* + wq_ind_table_init_attr); +int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size); diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h index 2b95c2c336eb..9303e0e4f508 100644 --- a/include/rdma/opa_port_info.h +++ b/include/rdma/opa_port_info.h @@ -33,11 +33,6 @@ #if !defined(OPA_PORT_INFO_H) #define OPA_PORT_INFO_H -/* Temporary until HFI driver is updated */ -#ifndef USE_PI_LED_ENABLE -#define USE_PI_LED_ENABLE 0 -#endif - #define OPA_PORT_LINK_MODE_NOP 
0 /* No change */ #define OPA_PORT_LINK_MODE_OPA 4 /* Port mode is OPA */ @@ -274,23 +269,12 @@ enum port_info_field_masks { OPA_PI_MASK_MTU_CAP = 0x0F, }; -#if USE_PI_LED_ENABLE struct opa_port_states { u8 reserved; u8 ledenable_offlinereason; /* 1 res, 1 bit, 6 bits */ u8 reserved2; u8 portphysstate_portstate; /* 4 bits, 4 bits */ }; -#define PI_LED_ENABLE_SUP 1 -#else -struct opa_port_states { - u8 reserved; - u8 offline_reason; /* 2 res, 6 bits */ - u8 reserved2; - u8 portphysstate_portstate; /* 4 bits, 4 bits */ -}; -#define PI_LED_ENABLE_SUP 0 -#endif struct opa_port_state_info { struct opa_port_states port_states; diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index afe44fde72a5..81fb1d15e8bb 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -333,11 +333,13 @@ int rdma_disconnect(struct rdma_cm_id *id); * address. * @id: Communication identifier associated with the request. * @addr: Multicast address identifying the group to join. + * @join_state: Multicast JoinState bitmap requested by port. + * Bitmap is based on IB_SA_MCMEMBER_REC_JOIN_STATE bits. * @context: User-defined context associated with the join request, returned * to the user through the private_data pointer in multicast events. 
*/ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, - void *context); + u8 join_state, void *context); /** * rdma_leave_multicast - Leave the multicast group specified by the given diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 9c9a27d42aaa..e31502107a58 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -158,6 +158,7 @@ struct rvt_driver_params { u32 max_mad_size; u8 qos_shift; u8 max_rdma_atomic; + u8 reserved_operations; }; /* Protection domain */ @@ -351,6 +352,9 @@ struct rvt_dev_info { /* Driver specific properties */ struct rvt_driver_params dparms; + /* post send table */ + const struct rvt_operation_params *post_parms; + struct rvt_mregion __rcu *dma_mr; struct rvt_lkey_table lkey_table; @@ -484,6 +488,9 @@ void rvt_unregister_device(struct rvt_dev_info *rvd); int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, int port_index, u16 *pkey_table); +int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, + int access); +int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey); int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, u32 len, u64 vaddr, u32 rkey, int acc); int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h index 5edffdca8c53..6b3c6c8b6b77 100644 --- a/include/rdma/rdmavt_mr.h +++ b/include/rdma/rdmavt_mr.h @@ -81,6 +81,7 @@ struct rvt_mregion { u32 mapsz; /* size of the map array */ u8 page_shift; /* 0 - non unform/non powerof2 sizes */ u8 lkey_published; /* in global table */ + atomic_t lkey_invalid; /* true if current lkey is invalid */ struct completion comp; /* complete when refcount goes to zero */ atomic_t refcount; struct rvt_segarray *map[0]; /* the segments */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 6d23b879416a..bd34d0b56bf7 100644 --- a/include/rdma/rdmavt_qp.h +++ 
b/include/rdma/rdmavt_qp.h @@ -145,6 +145,12 @@ (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND) /* + * Internal send flags + */ +#define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START +#define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1) + +/* * Send work request queue entry. * The size of the sg_list is determined when the QP is created and stored * in qp->s_max_sge. @@ -216,23 +222,43 @@ struct rvt_mmap_info { * to send a RDMA read response or atomic operation. */ struct rvt_ack_entry { - u8 opcode; - u8 sent; + struct rvt_sge rdma_sge; + u64 atomic_data; u32 psn; u32 lpsn; - union { - struct rvt_sge rdma_sge; - u64 atomic_data; - }; + u8 opcode; + u8 sent; }; #define RC_QP_SCALING_INTERVAL 5 -/* - * Variables prefixed with s_ are for the requester (sender). - * Variables prefixed with r_ are for the responder (receiver). - * Variables prefixed with ack_ are for responder replies. +#define RVT_OPERATION_PRIV 0x00000001 +#define RVT_OPERATION_ATOMIC 0x00000002 +#define RVT_OPERATION_ATOMIC_SGE 0x00000004 +#define RVT_OPERATION_LOCAL 0x00000008 +#define RVT_OPERATION_USE_RESERVE 0x00000010 + +#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) + +/** + * rvt_operation_params - op table entry + * @length - the length to copy into the swqe entry + * @qpt_support - a bit mask indicating QP type support + * @flags - RVT_OPERATION flags (see above) * + * This supports table driven post send so that + * the driver can have differing an potentially + * different sets of operations. + * + **/ + +struct rvt_operation_params { + size_t length; + u32 qpt_support; + u32 flags; +}; + +/* * Common variables are protected by both r_rq.lock and s_lock in that order * which only happens in modify_qp() or changing the QP 'state'. 
*/ @@ -307,6 +333,7 @@ struct rvt_qp { u32 s_next_psn; /* PSN for next request */ u32 s_avail; /* number of entries avail */ u32 s_ssn; /* SSN of tail entry */ + atomic_t s_reserved_used; /* reserved entries in use */ spinlock_t s_lock ____cacheline_aligned_in_smp; u32 s_flags; @@ -343,6 +370,8 @@ struct rvt_qp { struct rvt_sge_state s_ack_rdma_sge; struct timer_list s_timer; + atomic_t local_ops_pending; /* number of fast_reg/local_inv reqs */ + /* * This sge list MUST be last. Do not add anything below here. */ @@ -436,6 +465,49 @@ static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) rq->max_sge * sizeof(struct ib_sge)) * n); } +/** + * rvt_qp_wqe_reserve - reserve operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This routine used in post send to record + * a wqe relative reserved operation use. + */ +static inline void rvt_qp_wqe_reserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; + atomic_inc(&qp->s_reserved_used); +} + +/** + * rvt_qp_wqe_unreserve - clean reserved operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This decrements the reserve use count. + * + * This call MUST precede the change to + * s_last to insure that post send sees a stable + * s_avail. + * + * An smp_mp__after_atomic() is used to insure + * the compiler does not juggle the order of the s_last + * ring index and the decrementing of s_reserved_used. 
+ */ +static inline void rvt_qp_wqe_unreserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) { + wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED; + atomic_dec(&qp->s_reserved_used); + /* insure no compiler re-order up to s_last change */ + smp_mb__after_atomic(); + } +} + extern const int ib_rvt_state_ops[]; struct rvt_dev_info; diff --git a/drivers/scsi/ibmvscsi/viosrp.h b/include/scsi/viosrp.h index c1ab8a4c3161..974e07bd8e59 100644 --- a/drivers/scsi/ibmvscsi/viosrp.h +++ b/include/scsi/viosrp.h @@ -15,11 +15,6 @@ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ /* GNU General Public License for more details. */ /* */ -/* You should have received a copy of the GNU General Public License */ -/* along with this program; if not, write to the Free Software */ -/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -/* */ -/* */ /* This file contains structures and definitions for IBM RPA (RS/6000 */ /* platform architecture) implementation of the SRP (SCSI RDMA Protocol) */ /* standard. SRP is used on IBM iSeries and pSeries platforms to send SCSI */ @@ -93,7 +88,7 @@ struct viosrp_crq { }; /* MADs are Management requests above and beyond the IUs defined in the SRP - * standard. + * standard. */ enum viosrp_mad_types { VIOSRP_EMPTY_IU_TYPE = 0x01, @@ -131,7 +126,7 @@ enum viosrp_capability_flag { CAP_LIST_DATA = 0x08, }; -/* +/* * Common MAD header */ struct mad_common { @@ -146,7 +141,7 @@ struct mad_common { * client to the server. There is no way for the server to send * an asynchronous message back to the client. 
The Empty IU is used * to hang out a meaningless request to the server so that it can respond - * asynchrouously with something like a SCSI AER + * asynchrouously with something like a SCSI AER */ struct viosrp_empty_iu { struct mad_common common; @@ -189,7 +184,7 @@ struct mad_migration_cap { __be32 ecl; }; -struct capabilities{ +struct capabilities { __be32 flags; char name[SRP_MAX_LOC_LEN]; char loc[SRP_MAX_LOC_LEN]; diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h index d8ab5101fad5..f6f3bc52c1ac 100644 --- a/include/target/target_core_backend.h +++ b/include/target/target_core_backend.h @@ -95,6 +95,6 @@ sense_reason_t passthrough_parse_cdb(struct se_cmd *cmd, bool target_sense_desc_format(struct se_device *dev); sector_t target_to_linux_sector(struct se_device *dev, sector_t lb); bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib, - struct request_queue *q, int block_size); + struct request_queue *q); #endif /* TARGET_CORE_BACKEND_H */ diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index b316b44d03f3..fb8e3b6febdf 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -142,6 +142,7 @@ enum se_cmd_flags_table { SCF_PASSTHROUGH_PROT_SG_TO_MEM_NOALLOC = 0x00200000, SCF_ACK_KREF = 0x00400000, SCF_USE_CPUID = 0x00800000, + SCF_TASK_ATTR_SET = 0x01000000, }; /* diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h index de44462a7680..5cd6faa6e0d1 100644 --- a/include/target/target_core_fabric.h +++ b/include/target/target_core_fabric.h @@ -163,7 +163,6 @@ int core_tmr_alloc_req(struct se_cmd *, void *, u8, gfp_t); void core_tmr_release_req(struct se_tmr_req *); int transport_generic_handle_tmr(struct se_cmd *); void transport_generic_request_failure(struct se_cmd *, sense_reason_t); -void __target_execute_cmd(struct se_cmd *); int transport_lookup_tmr_lun(struct se_cmd *, u64); void 
core_allocate_nexus_loss_ua(struct se_node_acl *acl); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 5b81ef304388..e030d6f6c19a 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -66,6 +66,21 @@ struct btrfs_qgroup_extent_record; { BTRFS_BLOCK_GROUP_RAID6, "RAID6"} #define BTRFS_UUID_SIZE 16 +#define TP_STRUCT__entry_fsid __array(u8, fsid, BTRFS_UUID_SIZE) + +#define TP_fast_assign_fsid(fs_info) \ + memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE) + +#define TP_STRUCT__entry_btrfs(args...) \ + TP_STRUCT__entry( \ + TP_STRUCT__entry_fsid \ + args) +#define TP_fast_assign_btrfs(fs_info, args...) \ + TP_fast_assign( \ + TP_fast_assign_fsid(fs_info); \ + args) +#define TP_printk_btrfs(fmt, args...) \ + TP_printk("%pU: " fmt, __entry->fsid, args) TRACE_EVENT(btrfs_transaction_commit, @@ -73,17 +88,17 @@ TRACE_EVENT(btrfs_transaction_commit, TP_ARGS(root), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, generation ) __field( u64, root_objectid ) ), - TP_fast_assign( + TP_fast_assign_btrfs(root->fs_info, __entry->generation = root->fs_info->generation; __entry->root_objectid = root->root_key.objectid; ), - TP_printk("root = %llu(%s), gen = %llu", + TP_printk_btrfs("root = %llu(%s), gen = %llu", show_root_type(__entry->root_objectid), (unsigned long long)__entry->generation) ); @@ -94,7 +109,7 @@ DECLARE_EVENT_CLASS(btrfs__inode, TP_ARGS(inode), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( ino_t, ino ) __field( blkcnt_t, blocks ) __field( u64, disk_i_size ) @@ -104,7 +119,7 @@ DECLARE_EVENT_CLASS(btrfs__inode, __field( u64, root_objectid ) ), - TP_fast_assign( + TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->ino = inode->i_ino; __entry->blocks = inode->i_blocks; __entry->disk_i_size = BTRFS_I(inode)->disk_i_size; @@ -115,7 +130,7 @@ DECLARE_EVENT_CLASS(btrfs__inode, BTRFS_I(inode)->root->root_key.objectid; ), - TP_printk("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, " + 
TP_printk_btrfs("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, " "disk_i_size = %llu, last_trans = %llu, logged_trans = %llu", show_root_type(__entry->root_objectid), (unsigned long long)__entry->generation, @@ -175,7 +190,7 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, TP_CONDITION(map), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, start ) __field( u64, len ) @@ -187,7 +202,7 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, __field( unsigned int, compress_type ) ), - TP_fast_assign( + TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; __entry->start = map->start; __entry->len = map->len; @@ -199,7 +214,7 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, __entry->compress_type = map->compress_type; ), - TP_printk("root = %llu(%s), start = %llu, len = %llu, " + TP_printk_btrfs("root = %llu(%s), start = %llu, len = %llu, " "orig_start = %llu, block_start = %llu(%s), " "block_len = %llu, flags = %s, refs = %u, " "compress_type = %u", @@ -233,7 +248,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, TP_ARGS(inode, ordered), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( ino_t, ino ) __field( u64, file_offset ) __field( u64, start ) @@ -246,7 +261,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __field( u64, root_objectid ) ), - TP_fast_assign( + TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->ino = inode->i_ino; __entry->file_offset = ordered->file_offset; __entry->start = ordered->start; @@ -260,7 +275,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, BTRFS_I(inode)->root->root_key.objectid; ), - TP_printk("root = %llu(%s), ino = %llu, file_offset = %llu, " + TP_printk_btrfs("root = %llu(%s), ino = %llu, file_offset = %llu, " "start = %llu, len = %llu, disk_len = %llu, " "bytes_left = %llu, flags = %s, compress_type = %d, " "refs = %d", @@ -310,7 +325,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage, TP_ARGS(page, inode, wbc), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( ino_t, ino ) 
__field( pgoff_t, index ) __field( long, nr_to_write ) @@ -324,7 +339,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage, __field( u64, root_objectid ) ), - TP_fast_assign( + TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->ino = inode->i_ino; __entry->index = page->index; __entry->nr_to_write = wbc->nr_to_write; @@ -339,7 +354,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage, BTRFS_I(inode)->root->root_key.objectid; ), - TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " + TP_printk_btrfs("root = %llu(%s), ino = %lu, page_index = %lu, " "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " "range_end = %llu, for_kupdate = %d, " "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", @@ -366,7 +381,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook, TP_ARGS(page, start, end, uptodate), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( ino_t, ino ) __field( pgoff_t, index ) __field( u64, start ) @@ -375,7 +390,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook, __field( u64, root_objectid ) ), - TP_fast_assign( + TP_fast_assign_btrfs(btrfs_sb(page->mapping->host->i_sb), __entry->ino = page->mapping->host->i_ino; __entry->index = page->index; __entry->start = start; @@ -385,7 +400,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook, BTRFS_I(page->mapping->host)->root->root_key.objectid; ), - TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, " + TP_printk_btrfs("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, " "end = %llu, uptodate = %d", show_root_type(__entry->root_objectid), (unsigned long)__entry->ino, (unsigned long)__entry->index, @@ -399,7 +414,7 @@ TRACE_EVENT(btrfs_sync_file, TP_ARGS(file, datasync), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( ino_t, ino ) __field( ino_t, parent ) __field( int, datasync ) @@ -410,6 +425,7 @@ TRACE_EVENT(btrfs_sync_file, struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); + TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb)); __entry->ino = 
inode->i_ino; __entry->parent = d_inode(dentry->d_parent)->i_ino; __entry->datasync = datasync; @@ -417,7 +433,7 @@ TRACE_EVENT(btrfs_sync_file, BTRFS_I(inode)->root->root_key.objectid; ), - TP_printk("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d", + TP_printk_btrfs("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d", show_root_type(__entry->root_objectid), (unsigned long)__entry->ino, (unsigned long)__entry->parent, __entry->datasync) @@ -425,19 +441,19 @@ TRACE_EVENT(btrfs_sync_file, TRACE_EVENT(btrfs_sync_fs, - TP_PROTO(int wait), + TP_PROTO(struct btrfs_fs_info *fs_info, int wait), - TP_ARGS(wait), + TP_ARGS(fs_info, wait), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( int, wait ) ), - TP_fast_assign( + TP_fast_assign_btrfs(fs_info, __entry->wait = wait; ), - TP_printk("wait = %d", __entry->wait) + TP_printk_btrfs("wait = %d", __entry->wait) ); TRACE_EVENT(btrfs_add_block_group, @@ -490,13 +506,14 @@ TRACE_EVENT(btrfs_add_block_group, DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref, - TP_PROTO(struct btrfs_delayed_ref_node *ref, + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, struct btrfs_delayed_tree_ref *full_ref, int action), - TP_ARGS(ref, full_ref, action), + TP_ARGS(fs_info, ref, full_ref, action), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, bytenr ) __field( u64, num_bytes ) __field( int, action ) @@ -507,7 +524,7 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref, __field( u64, seq ) ), - TP_fast_assign( + TP_fast_assign_btrfs(fs_info, __entry->bytenr = ref->bytenr; __entry->num_bytes = ref->num_bytes; __entry->action = action; @@ -518,7 +535,7 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref, __entry->seq = ref->seq; ), - TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " + TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, action = %s, " "parent = %llu(%s), ref_root = %llu(%s), level = %d, " "type = %s, seq = %llu", (unsigned long long)__entry->bytenr, @@ -532,31 +549,34 @@ 
DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref, DEFINE_EVENT(btrfs_delayed_tree_ref, add_delayed_tree_ref, - TP_PROTO(struct btrfs_delayed_ref_node *ref, + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, struct btrfs_delayed_tree_ref *full_ref, int action), - TP_ARGS(ref, full_ref, action) + TP_ARGS(fs_info, ref, full_ref, action) ); DEFINE_EVENT(btrfs_delayed_tree_ref, run_delayed_tree_ref, - TP_PROTO(struct btrfs_delayed_ref_node *ref, + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, struct btrfs_delayed_tree_ref *full_ref, int action), - TP_ARGS(ref, full_ref, action) + TP_ARGS(fs_info, ref, full_ref, action) ); DECLARE_EVENT_CLASS(btrfs_delayed_data_ref, - TP_PROTO(struct btrfs_delayed_ref_node *ref, + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, struct btrfs_delayed_data_ref *full_ref, int action), - TP_ARGS(ref, full_ref, action), + TP_ARGS(fs_info, ref, full_ref, action), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, bytenr ) __field( u64, num_bytes ) __field( int, action ) @@ -568,7 +588,7 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref, __field( u64, seq ) ), - TP_fast_assign( + TP_fast_assign_btrfs(fs_info, __entry->bytenr = ref->bytenr; __entry->num_bytes = ref->num_bytes; __entry->action = action; @@ -580,7 +600,7 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref, __entry->seq = ref->seq; ), - TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " + TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, action = %s, " "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, " "offset = %llu, type = %s, seq = %llu", (unsigned long long)__entry->bytenr, @@ -596,45 +616,48 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref, DEFINE_EVENT(btrfs_delayed_data_ref, add_delayed_data_ref, - TP_PROTO(struct btrfs_delayed_ref_node *ref, + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, struct btrfs_delayed_data_ref *full_ref, int action), - 
TP_ARGS(ref, full_ref, action) + TP_ARGS(fs_info, ref, full_ref, action) ); DEFINE_EVENT(btrfs_delayed_data_ref, run_delayed_data_ref, - TP_PROTO(struct btrfs_delayed_ref_node *ref, + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, struct btrfs_delayed_data_ref *full_ref, int action), - TP_ARGS(ref, full_ref, action) + TP_ARGS(fs_info, ref, full_ref, action) ); DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, - TP_PROTO(struct btrfs_delayed_ref_node *ref, + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, struct btrfs_delayed_ref_head *head_ref, int action), - TP_ARGS(ref, head_ref, action), + TP_ARGS(fs_info, ref, head_ref, action), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, bytenr ) __field( u64, num_bytes ) __field( int, action ) __field( int, is_data ) ), - TP_fast_assign( + TP_fast_assign_btrfs(fs_info, __entry->bytenr = ref->bytenr; __entry->num_bytes = ref->num_bytes; __entry->action = action; __entry->is_data = head_ref->is_data; ), - TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d", + TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d", (unsigned long long)__entry->bytenr, (unsigned long long)__entry->num_bytes, show_ref_action(__entry->action), @@ -643,20 +666,22 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, DEFINE_EVENT(btrfs_delayed_ref_head, add_delayed_ref_head, - TP_PROTO(struct btrfs_delayed_ref_node *ref, + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, struct btrfs_delayed_ref_head *head_ref, int action), - TP_ARGS(ref, head_ref, action) + TP_ARGS(fs_info, ref, head_ref, action) ); DEFINE_EVENT(btrfs_delayed_ref_head, run_delayed_ref_head, - TP_PROTO(struct btrfs_delayed_ref_node *ref, + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, struct btrfs_delayed_ref_head *head_ref, int action), - TP_ARGS(ref, head_ref, action) + TP_ARGS(fs_info, ref, head_ref, action) ); 
#define show_chunk_type(type) \ @@ -678,7 +703,7 @@ DECLARE_EVENT_CLASS(btrfs__chunk, TP_ARGS(root, map, offset, size), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( int, num_stripes ) __field( u64, type ) __field( int, sub_stripes ) @@ -687,7 +712,7 @@ DECLARE_EVENT_CLASS(btrfs__chunk, __field( u64, root_objectid ) ), - TP_fast_assign( + TP_fast_assign_btrfs(root->fs_info, __entry->num_stripes = map->num_stripes; __entry->type = map->type; __entry->sub_stripes = map->sub_stripes; @@ -696,7 +721,7 @@ DECLARE_EVENT_CLASS(btrfs__chunk, __entry->root_objectid = root->root_key.objectid; ), - TP_printk("root = %llu(%s), offset = %llu, size = %llu, " + TP_printk_btrfs("root = %llu(%s), offset = %llu, size = %llu, " "num_stripes = %d, sub_stripes = %d, type = %s", show_root_type(__entry->root_objectid), (unsigned long long)__entry->offset, @@ -728,7 +753,7 @@ TRACE_EVENT(btrfs_cow_block, TP_ARGS(root, buf, cow), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, buf_start ) __field( int, refs ) @@ -737,7 +762,7 @@ TRACE_EVENT(btrfs_cow_block, __field( int, cow_level ) ), - TP_fast_assign( + TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; __entry->buf_start = buf->start; __entry->refs = atomic_read(&buf->refs); @@ -746,7 +771,7 @@ TRACE_EVENT(btrfs_cow_block, __entry->cow_level = btrfs_header_level(cow); ), - TP_printk("root = %llu(%s), refs = %d, orig_buf = %llu " + TP_printk_btrfs("root = %llu(%s), refs = %d, orig_buf = %llu " "(orig_level = %d), cow_buf = %llu (cow_level = %d)", show_root_type(__entry->root_objectid), __entry->refs, @@ -763,25 +788,23 @@ TRACE_EVENT(btrfs_space_reservation, TP_ARGS(fs_info, type, val, bytes, reserve), - TP_STRUCT__entry( - __array( u8, fsid, BTRFS_UUID_SIZE ) + TP_STRUCT__entry_btrfs( __string( type, type ) __field( u64, val ) __field( u64, bytes ) __field( int, reserve ) ), - TP_fast_assign( - memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE); + 
TP_fast_assign_btrfs(fs_info, __assign_str(type, type); __entry->val = val; __entry->bytes = bytes; __entry->reserve = reserve; ), - TP_printk("%pU: %s: %Lu %s %Lu", __entry->fsid, __get_str(type), - __entry->val, __entry->reserve ? "reserve" : "release", - __entry->bytes) + TP_printk_btrfs("%s: %Lu %s %Lu", __get_str(type), __entry->val, + __entry->reserve ? "reserve" : "release", + __entry->bytes) ); #define show_flush_action(action) \ @@ -872,22 +895,19 @@ DECLARE_EVENT_CLASS(btrfs__reserved_extent, TP_ARGS(root, start, len), - TP_STRUCT__entry( - __array( u8, fsid, BTRFS_UUID_SIZE ) - __field( u64, root_objectid ) - __field( u64, start ) - __field( u64, len ) + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, start ) + __field( u64, len ) ), - TP_fast_assign( - memcpy(__entry->fsid, root->fs_info->fsid, BTRFS_UUID_SIZE); + TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; __entry->start = start; __entry->len = len; ), - TP_printk("%pU: root = %llu(%s), start = %llu, len = %llu", - __entry->fsid, + TP_printk_btrfs("root = %llu(%s), start = %llu, len = %llu", show_root_type(__entry->root_objectid), (unsigned long long)__entry->start, (unsigned long long)__entry->len) @@ -914,21 +934,21 @@ TRACE_EVENT(find_free_extent, TP_ARGS(root, num_bytes, empty_size, data), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) __field( u64, data ) ), - TP_fast_assign( + TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; __entry->num_bytes = num_bytes; __entry->empty_size = empty_size; __entry->data = data; ), - TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, " + TP_printk_btrfs("root = %Lu(%s), len = %Lu, empty_size = %Lu, " "flags = %Lu(%s)", show_root_type(__entry->root_objectid), __entry->num_bytes, __entry->empty_size, __entry->data, __print_flags((unsigned long)__entry->data, "|", @@ -943,8 +963,7 
@@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, TP_ARGS(root, block_group, start, len), - TP_STRUCT__entry( - __array( u8, fsid, BTRFS_UUID_SIZE ) + TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, bg_objectid ) __field( u64, flags ) @@ -952,8 +971,7 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, __field( u64, len ) ), - TP_fast_assign( - memcpy(__entry->fsid, root->fs_info->fsid, BTRFS_UUID_SIZE); + TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; __entry->bg_objectid = block_group->key.objectid; __entry->flags = block_group->flags; @@ -961,8 +979,8 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, __entry->len = len; ), - TP_printk("%pU: root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), " - "start = %Lu, len = %Lu", __entry->fsid, + TP_printk_btrfs("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), " + "start = %Lu, len = %Lu", show_root_type(__entry->root_objectid), __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), @@ -994,7 +1012,7 @@ TRACE_EVENT(btrfs_find_cluster, TP_ARGS(block_group, start, bytes, empty_size, min_bytes), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) __field( u64, start ) @@ -1003,7 +1021,7 @@ TRACE_EVENT(btrfs_find_cluster, __field( u64, min_bytes ) ), - TP_fast_assign( + TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->key.objectid; __entry->flags = block_group->flags; __entry->start = start; @@ -1012,7 +1030,7 @@ TRACE_EVENT(btrfs_find_cluster, __entry->min_bytes = min_bytes; ), - TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu," + TP_printk_btrfs("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu," " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", @@ -1026,15 +1044,15 @@ TRACE_EVENT(btrfs_failed_cluster_setup, TP_ARGS(block_group), - 
TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) ), - TP_fast_assign( + TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->key.objectid; ), - TP_printk("block_group = %Lu", __entry->bg_objectid) + TP_printk_btrfs("block_group = %Lu", __entry->bg_objectid) ); TRACE_EVENT(btrfs_setup_cluster, @@ -1044,7 +1062,7 @@ TRACE_EVENT(btrfs_setup_cluster, TP_ARGS(block_group, cluster, size, bitmap), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) __field( u64, start ) @@ -1053,7 +1071,7 @@ TRACE_EVENT(btrfs_setup_cluster, __field( int, bitmap ) ), - TP_fast_assign( + TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->key.objectid; __entry->flags = block_group->flags; __entry->start = cluster->window_start; @@ -1062,7 +1080,7 @@ TRACE_EVENT(btrfs_setup_cluster, __entry->bitmap = bitmap; ), - TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, " + TP_printk_btrfs("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, " "size = %Lu, max_size = %Lu, bitmap = %d", __entry->bg_objectid, __entry->flags, @@ -1120,7 +1138,7 @@ DECLARE_EVENT_CLASS(btrfs__work, TP_ARGS(work), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( void *, work ) __field( void *, wq ) __field( void *, func ) @@ -1129,7 +1147,7 @@ DECLARE_EVENT_CLASS(btrfs__work, __field( void *, normal_work ) ), - TP_fast_assign( + TP_fast_assign_btrfs(btrfs_work_owner(work), __entry->work = work; __entry->wq = work->wq; __entry->func = work->func; @@ -1138,7 +1156,7 @@ DECLARE_EVENT_CLASS(btrfs__work, __entry->normal_work = &work->normal_work; ), - TP_printk("work=%p (normal_work=%p), wq=%p, func=%pf, ordered_func=%p," + TP_printk_btrfs("work=%p (normal_work=%p), wq=%p, func=%pf, ordered_func=%p," " ordered_free=%p", __entry->work, __entry->normal_work, __entry->wq, __entry->func, __entry->ordered_func, __entry->ordered_free) @@ -1151,15 +1169,15 @@ 
DECLARE_EVENT_CLASS(btrfs__work__done, TP_ARGS(work), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( void *, work ) ), - TP_fast_assign( + TP_fast_assign_btrfs(btrfs_work_owner(work), __entry->work = work; ), - TP_printk("work->%p", __entry->work) + TP_printk_btrfs("work->%p", __entry->work) ); DEFINE_EVENT(btrfs__work, btrfs_work_queued, @@ -1196,19 +1214,19 @@ DECLARE_EVENT_CLASS(btrfs__workqueue, TP_ARGS(wq, name, high), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( void *, wq ) __string( name, name ) __field( int , high ) ), - TP_fast_assign( + TP_fast_assign_btrfs(btrfs_workqueue_owner(wq), __entry->wq = wq; __assign_str(name, name); __entry->high = high; ), - TP_printk("name=%s%s, wq=%p", __get_str(name), + TP_printk_btrfs("name=%s%s, wq=%p", __get_str(name), __print_flags(__entry->high, "", {(WQ_HIGHPRI), "-high"}), __entry->wq) @@ -1227,15 +1245,15 @@ DECLARE_EVENT_CLASS(btrfs__workqueue_done, TP_ARGS(wq), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( void *, wq ) ), - TP_fast_assign( + TP_fast_assign_btrfs(btrfs_workqueue_owner(wq), __entry->wq = wq; ), - TP_printk("wq=%p", __entry->wq) + TP_printk_btrfs("wq=%p", __entry->wq) ); DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy, @@ -1251,19 +1269,19 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_data_map, TP_ARGS(inode, free_reserved), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, rootid ) __field( unsigned long, ino ) __field( u64, free_reserved ) ), - TP_fast_assign( + TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->rootid = BTRFS_I(inode)->root->objectid; __entry->ino = inode->i_ino; __entry->free_reserved = free_reserved; ), - TP_printk("rootid=%llu, ino=%lu, free_reserved=%llu", + TP_printk_btrfs("rootid=%llu, ino=%lu, free_reserved=%llu", __entry->rootid, __entry->ino, __entry->free_reserved) ); @@ -1292,7 +1310,7 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data, TP_ARGS(inode, start, len, reserved, op), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( 
u64, rootid ) __field( unsigned long, ino ) __field( u64, start ) @@ -1301,7 +1319,7 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data, __field( int, op ) ), - TP_fast_assign( + TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->rootid = BTRFS_I(inode)->root->objectid; __entry->ino = inode->i_ino; __entry->start = start; @@ -1310,7 +1328,7 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data, __entry->op = op; ), - TP_printk("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s", + TP_printk_btrfs("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s", __entry->rootid, __entry->ino, __entry->start, __entry->len, __entry->reserved, __print_flags((unsigned long)__entry->op, "", @@ -1334,86 +1352,90 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data, DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref, - TP_PROTO(u64 ref_root, u64 reserved), + TP_PROTO(struct btrfs_fs_info *fs_info, u64 ref_root, u64 reserved), - TP_ARGS(ref_root, reserved), + TP_ARGS(fs_info, ref_root, reserved), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, ref_root ) __field( u64, reserved ) ), - TP_fast_assign( + TP_fast_assign_btrfs(fs_info, __entry->ref_root = ref_root; __entry->reserved = reserved; ), - TP_printk("root=%llu, reserved=%llu, op=free", + TP_printk_btrfs("root=%llu, reserved=%llu, op=free", __entry->ref_root, __entry->reserved) ); DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref, - TP_PROTO(u64 ref_root, u64 reserved), + TP_PROTO(struct btrfs_fs_info *fs_info, u64 ref_root, u64 reserved), - TP_ARGS(ref_root, reserved) + TP_ARGS(fs_info, ref_root, reserved) ); DECLARE_EVENT_CLASS(btrfs_qgroup_extent, - TP_PROTO(struct btrfs_qgroup_extent_record *rec), + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_extent_record *rec), - TP_ARGS(rec), + TP_ARGS(fs_info, rec), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, bytenr ) __field( u64, num_bytes ) ), - TP_fast_assign( + TP_fast_assign_btrfs(fs_info, 
__entry->bytenr = rec->bytenr, __entry->num_bytes = rec->num_bytes; ), - TP_printk("bytenr = %llu, num_bytes = %llu", + TP_printk_btrfs("bytenr = %llu, num_bytes = %llu", (unsigned long long)__entry->bytenr, (unsigned long long)__entry->num_bytes) ); DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, - TP_PROTO(struct btrfs_qgroup_extent_record *rec), + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_extent_record *rec), - TP_ARGS(rec) + TP_ARGS(fs_info, rec) ); DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_insert_dirty_extent, - TP_PROTO(struct btrfs_qgroup_extent_record *rec), + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_extent_record *rec), - TP_ARGS(rec) + TP_ARGS(fs_info, rec) ); TRACE_EVENT(btrfs_qgroup_account_extent, - TP_PROTO(u64 bytenr, u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots), + TP_PROTO(struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots), - TP_ARGS(bytenr, num_bytes, nr_old_roots, nr_new_roots), + TP_ARGS(fs_info, bytenr, num_bytes, nr_old_roots, nr_new_roots), - TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, bytenr ) __field( u64, num_bytes ) __field( u64, nr_old_roots ) __field( u64, nr_new_roots ) ), - TP_fast_assign( + TP_fast_assign_btrfs(fs_info, __entry->bytenr = bytenr; __entry->num_bytes = num_bytes; __entry->nr_old_roots = nr_old_roots; __entry->nr_new_roots = nr_new_roots; ), - TP_printk("bytenr = %llu, num_bytes = %llu, nr_old_roots = %llu, " + TP_printk_btrfs("bytenr = %llu, num_bytes = %llu, nr_old_roots = %llu, " "nr_new_roots = %llu", __entry->bytenr, __entry->num_bytes, @@ -1423,23 +1445,24 @@ TRACE_EVENT(btrfs_qgroup_account_extent, TRACE_EVENT(qgroup_update_counters, - TP_PROTO(u64 qgid, u64 cur_old_count, u64 cur_new_count), + TP_PROTO(struct btrfs_fs_info *fs_info, u64 qgid, + u64 cur_old_count, u64 cur_new_count), - TP_ARGS(qgid, cur_old_count, cur_new_count), + TP_ARGS(fs_info, qgid, cur_old_count, cur_new_count), - 
TP_STRUCT__entry( + TP_STRUCT__entry_btrfs( __field( u64, qgid ) __field( u64, cur_old_count ) __field( u64, cur_new_count ) ), - TP_fast_assign( + TP_fast_assign_btrfs(fs_info, __entry->qgid = qgid; __entry->cur_old_count = cur_old_count; __entry->cur_new_count = cur_new_count; ), - TP_printk("qgid = %llu, cur_old_count = %llu, cur_new_count = %llu", + TP_printk_btrfs("qgid = %llu, cur_old_count = %llu, cur_new_count = %llu", __entry->qgid, __entry->cur_old_count, __entry->cur_new_count) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 003dca933803..8a707f8a41c3 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -473,6 +473,39 @@ TRACE_EVENT(svc_recv, show_rqstp_flags(__entry->flags)) ); +DECLARE_EVENT_CLASS(svc_rqst_event, + + TP_PROTO(struct svc_rqst *rqst), + + TP_ARGS(rqst), + + TP_STRUCT__entry( + __field(__be32, xid) + __field(unsigned long, flags) + __dynamic_array(unsigned char, addr, rqst->rq_addrlen) + ), + + TP_fast_assign( + __entry->xid = rqst->rq_xid; + __entry->flags = rqst->rq_flags; + memcpy(__get_dynamic_array(addr), + &rqst->rq_addr, rqst->rq_addrlen); + ), + + TP_printk("addr=%pIScp rq_xid=0x%x flags=%s", + (struct sockaddr *)__get_dynamic_array(addr), + be32_to_cpu(__entry->xid), + show_rqstp_flags(__entry->flags)) +); + +DEFINE_EVENT(svc_rqst_event, svc_defer, + TP_PROTO(struct svc_rqst *rqst), + TP_ARGS(rqst)); + +DEFINE_EVENT(svc_rqst_event, svc_drop, + TP_PROTO(struct svc_rqst *rqst), + TP_ARGS(rqst)); + DECLARE_EVENT_CLASS(svc_rqst_status, TP_PROTO(struct svc_rqst *rqst, int status), @@ -529,45 +562,67 @@ TRACE_EVENT(svc_xprt_do_enqueue, TP_STRUCT__entry( __field(struct svc_xprt *, xprt) - __field_struct(struct sockaddr_storage, ss) __field(int, pid) __field(unsigned long, flags) + __dynamic_array(unsigned char, addr, xprt != NULL ? + xprt->xpt_remotelen : 0) ), TP_fast_assign( __entry->xprt = xprt; - xprt ? 
memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss)); __entry->pid = rqst? rqst->rq_task->pid : 0; - __entry->flags = xprt ? xprt->xpt_flags : 0; + if (xprt) { + memcpy(__get_dynamic_array(addr), + &xprt->xpt_remote, + xprt->xpt_remotelen); + __entry->flags = xprt->xpt_flags; + } else + __entry->flags = 0; ), TP_printk("xprt=0x%p addr=%pIScp pid=%d flags=%s", __entry->xprt, - (struct sockaddr *)&__entry->ss, + __get_dynamic_array_len(addr) != 0 ? + (struct sockaddr *)__get_dynamic_array(addr) : NULL, __entry->pid, show_svc_xprt_flags(__entry->flags)) ); -TRACE_EVENT(svc_xprt_dequeue, +DECLARE_EVENT_CLASS(svc_xprt_event, TP_PROTO(struct svc_xprt *xprt), TP_ARGS(xprt), TP_STRUCT__entry( __field(struct svc_xprt *, xprt) - __field_struct(struct sockaddr_storage, ss) __field(unsigned long, flags) + __dynamic_array(unsigned char, addr, xprt != NULL ? + xprt->xpt_remotelen : 0) ), TP_fast_assign( - __entry->xprt = xprt, - xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss)); - __entry->flags = xprt ? xprt->xpt_flags : 0; + __entry->xprt = xprt; + if (xprt) { + memcpy(__get_dynamic_array(addr), + &xprt->xpt_remote, + xprt->xpt_remotelen); + __entry->flags = xprt->xpt_flags; + } else + __entry->flags = 0; ), TP_printk("xprt=0x%p addr=%pIScp flags=%s", __entry->xprt, - (struct sockaddr *)&__entry->ss, + __get_dynamic_array_len(addr) != 0 ? 
+ (struct sockaddr *)__get_dynamic_array(addr) : NULL, show_svc_xprt_flags(__entry->flags)) ); +DEFINE_EVENT(svc_xprt_event, svc_xprt_dequeue, + TP_PROTO(struct svc_xprt *xprt), + TP_ARGS(xprt)); + +DEFINE_EVENT(svc_xprt_event, svc_xprt_no_write_space, + TP_PROTO(struct svc_xprt *xprt), + TP_ARGS(xprt)); + TRACE_EVENT(svc_wake_up, TP_PROTO(int pid), @@ -592,21 +647,56 @@ TRACE_EVENT(svc_handle_xprt, TP_STRUCT__entry( __field(struct svc_xprt *, xprt) __field(int, len) - __field_struct(struct sockaddr_storage, ss) __field(unsigned long, flags) + __dynamic_array(unsigned char, addr, xprt != NULL ? + xprt->xpt_remotelen : 0) ), TP_fast_assign( __entry->xprt = xprt; - xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss)); __entry->len = len; - __entry->flags = xprt ? xprt->xpt_flags : 0; + if (xprt) { + memcpy(__get_dynamic_array(addr), + &xprt->xpt_remote, + xprt->xpt_remotelen); + __entry->flags = xprt->xpt_flags; + } else + __entry->flags = 0; ), TP_printk("xprt=0x%p addr=%pIScp len=%d flags=%s", __entry->xprt, - (struct sockaddr *)&__entry->ss, + __get_dynamic_array_len(addr) != 0 ? 
+ (struct sockaddr *)__get_dynamic_array(addr) : NULL, __entry->len, show_svc_xprt_flags(__entry->flags)) ); + + +DECLARE_EVENT_CLASS(svc_deferred_event, + TP_PROTO(struct svc_deferred_req *dr), + + TP_ARGS(dr), + + TP_STRUCT__entry( + __field(__be32, xid) + __dynamic_array(unsigned char, addr, dr->addrlen) + ), + + TP_fast_assign( + __entry->xid = *(__be32 *)(dr->args + (dr->xprt_hlen>>2)); + memcpy(__get_dynamic_array(addr), &dr->addr, dr->addrlen); + ), + + TP_printk("addr=%pIScp xid=0x%x", + (struct sockaddr *)__get_dynamic_array(addr), + be32_to_cpu(__entry->xid)) +); + +DEFINE_EVENT(svc_deferred_event, svc_drop_deferred, + TP_PROTO(struct svc_deferred_req *dr), + TP_ARGS(dr)); +DEFINE_EVENT(svc_deferred_event, svc_revisit_deferred, + TP_PROTO(struct svc_deferred_req *dr), + TP_ARGS(dr)); #endif /* _TRACE_SUNRPC_H */ #include <trace/define_trace.h> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 2bdd1e3e7007..ac5eacd3055b 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -798,7 +798,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ - struct btrfs_ioctl_ino_path_args) + struct btrfs_ioctl_logical_ino_args) #define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ struct btrfs_ioctl_received_subvol_args) #define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args) diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 70b172ba41ce..b59ee077a596 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -382,6 +382,19 @@ typedef struct elf64_shdr { #define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ #define NT_PPC_SPE 0x101 /* PowerPC SPE/EVR registers */ #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ +#define NT_PPC_TAR 0x103 /* Target Address Register */ +#define 
NT_PPC_PPR 0x104 /* Program Priority Register */ +#define NT_PPC_DSCR 0x105 /* Data Stream Control Register */ +#define NT_PPC_EBB 0x106 /* Event Based Branch Registers */ +#define NT_PPC_PMU 0x107 /* Performance Monitor Registers */ +#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#define NT_PPC_TM_CTAR 0x10d /* TM checkpointed Target Address Register */ +#define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */ +#define NT_PPC_TM_CDSCR 0x10f /* TM checkpointed Data Stream Control Register */ #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild index 231901b08f6c..4edb0f2b4f9f 100644 --- a/include/uapi/rdma/Kbuild +++ b/include/uapi/rdma/Kbuild @@ -6,3 +6,4 @@ header-y += ib_user_verbs.h header-y += rdma_netlink.h header-y += rdma_user_cm.h header-y += hfi/ +header-y += rdma_user_rxe.h diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index 98bebf8bef55..d15e7289d835 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -75,7 +75,7 @@ * may not be implemented; the user code must deal with this if it * cares, or it must abort after initialization reports the difference. */ -#define HFI1_USER_SWMINOR 1 +#define HFI1_USER_SWMINOR 2 /* * We will encode the major/minor inside a single 32bit version number. 
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index b6543d73d20a..7f035f4b53b0 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -95,6 +95,11 @@ enum { IB_USER_VERBS_EX_CMD_CREATE_QP = IB_USER_VERBS_CMD_CREATE_QP, IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, IB_USER_VERBS_EX_CMD_DESTROY_FLOW, + IB_USER_VERBS_EX_CMD_CREATE_WQ, + IB_USER_VERBS_EX_CMD_MODIFY_WQ, + IB_USER_VERBS_EX_CMD_DESTROY_WQ, + IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL, + IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL }; /* @@ -518,6 +523,14 @@ struct ib_uverbs_create_qp { __u64 driver_data[0]; }; +enum ib_uverbs_create_qp_mask { + IB_UVERBS_CREATE_QP_MASK_IND_TABLE = 1UL << 0, +}; + +enum { + IB_UVERBS_CREATE_QP_SUP_COMP_MASK = IB_UVERBS_CREATE_QP_MASK_IND_TABLE, +}; + struct ib_uverbs_ex_create_qp { __u64 user_handle; __u32 pd_handle; @@ -535,6 +548,8 @@ struct ib_uverbs_ex_create_qp { __u8 reserved; __u32 comp_mask; __u32 create_flags; + __u32 rwq_ind_tbl_handle; + __u32 reserved1; }; struct ib_uverbs_open_qp { @@ -852,6 +867,24 @@ struct ib_uverbs_flow_spec_tcp_udp { struct ib_uverbs_flow_tcp_udp_filter mask; }; +struct ib_uverbs_flow_ipv6_filter { + __u8 src_ip[16]; + __u8 dst_ip[16]; +}; + +struct ib_uverbs_flow_spec_ipv6 { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_ipv6_filter val; + struct ib_uverbs_flow_ipv6_filter mask; +}; + struct ib_uverbs_flow_attr { __u32 type; __u16 size; @@ -946,4 +979,66 @@ struct ib_uverbs_destroy_srq_resp { __u32 events_reported; }; +struct ib_uverbs_ex_create_wq { + __u32 comp_mask; + __u32 wq_type; + __u64 user_handle; + __u32 pd_handle; + __u32 cq_handle; + __u32 max_wr; + __u32 max_sge; +}; + +struct ib_uverbs_ex_create_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 wq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 wqn; +}; + +struct ib_uverbs_ex_destroy_wq { 
+ __u32 comp_mask; + __u32 wq_handle; +}; + +struct ib_uverbs_ex_destroy_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 events_reported; + __u32 reserved; +}; + +struct ib_uverbs_ex_modify_wq { + __u32 attr_mask; + __u32 wq_handle; + __u32 wq_state; + __u32 curr_wq_state; +}; + +/* Prevent memory allocation rather than max expected size */ +#define IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE 0x0d +struct ib_uverbs_ex_create_rwq_ind_table { + __u32 comp_mask; + __u32 log_ind_tbl_size; + /* Following are the wq handles according to log_ind_tbl_size + * wq_handle1 + * wq_handle2 + */ + __u32 wq_handles[0]; +}; + +struct ib_uverbs_ex_create_rwq_ind_table_resp { + __u32 comp_mask; + __u32 response_length; + __u32 ind_tbl_handle; + __u32 ind_tbl_num; +}; + +struct ib_uverbs_ex_destroy_rwq_ind_table { + __u32 comp_mask; + __u32 ind_tbl_handle; +}; + #endif /* IB_USER_VERBS_H */ diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 3066718eb120..01923d463673 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -244,12 +244,19 @@ struct rdma_ucm_join_ip_mcast { __u32 id; }; +/* Multicast join flags */ +enum { + RDMA_MC_JOIN_FLAG_FULLMEMBER, + RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER, + RDMA_MC_JOIN_FLAG_RESERVED, +}; + struct rdma_ucm_join_mcast { __u64 response; /* rdma_ucma_create_id_resp */ __u64 uid; __u32 id; __u16 addr_size; - __u16 reserved; + __u16 join_flags; struct sockaddr_storage addr; }; diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h new file mode 100644 index 000000000000..1de99cfdaf7d --- /dev/null +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef RDMA_USER_RXE_H +#define RDMA_USER_RXE_H + +#include <linux/types.h> + +union rxe_gid { + __u8 raw[16]; + struct { + __be64 subnet_prefix; + __be64 interface_id; + } global; +}; + +struct rxe_global_route { + union rxe_gid dgid; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; +}; + +struct rxe_av { + __u8 port_num; + __u8 network_type; + struct rxe_global_route grh; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; +}; + +struct rxe_send_wr { + __u64 wr_id; + __u32 num_sge; + __u32 opcode; + __u32 send_flags; + union { + __be32 imm_data; + __u32 invalidate_rkey; + } ex; + union { + struct { + __u64 remote_addr; + __u32 rkey; + } rdma; + struct { + __u64 remote_addr; + __u64 compare_add; + __u64 swap; + __u32 rkey; + } atomic; + struct { + __u32 remote_qpn; + __u32 remote_qkey; + __u16 pkey_index; + } ud; + struct { + struct ib_mr *mr; + __u32 key; + int access; + } reg; + } wr; +}; + +struct rxe_sge { + __u64 addr; + __u32 length; + __u32 lkey; +}; + +struct mminfo { + __u64 offset; + __u32 size; + __u32 pad; +}; + +struct rxe_dma_info { + __u32 length; + __u32 resid; + __u32 cur_sge; + __u32 num_sge; + __u32 sge_offset; + union { + __u8 inline_data[0]; + struct rxe_sge sge[0]; + }; +}; + +struct rxe_send_wqe { + struct rxe_send_wr wr; + struct rxe_av av; + __u32 status; + __u32 state; + __u64 iova; + __u32 mask; + __u32 first_psn; + __u32 last_psn; + __u32 ack_length; + __u32 ssn; + __u32 has_rd_atomic; + struct rxe_dma_info dma; +}; + +struct rxe_recv_wqe { + __u64 wr_id; + __u32 num_sge; + __u32 padding; + struct rxe_dma_info dma; +}; + +#endif /* RDMA_USER_RXE_H */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 20400055f177..93ad6c1fb9b6 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -288,6 +288,9 @@ void __init jump_label_init(void) BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); 
BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); + if (static_key_initialized) + return; + jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); diff --git a/mm/Kconfig b/mm/Kconfig index c0837845c17c..78a23c5c302d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,6 +187,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG + depends on !KASAN config MEMORY_HOTPLUG_SPARSE def_bool y diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ef968306fd5b..b9aa1b0b38b0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3942,6 +3942,14 @@ same_page: return i ? i : -EFAULT; } +#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE +/* + * ARCHes with special requirements for evicting HUGETLB backing TLB entries can + * implement this. + */ +#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) +#endif + unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { @@ -4002,7 +4010,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * once we release i_mmap_rwsem, another task can do the final put_page * and that page table be reused and filled with junk. */ - flush_tlb_range(vma, start, end); + flush_hugetlb_tlb_range(vma, start, end); mmu_notifier_invalidate_range(mm, start, end); i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(mm, start, end); diff --git a/mm/memblock.c b/mm/memblock.c index ff5ff3b5f1ea..483197ef613f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -482,7 +482,7 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) * @flags: flags of the new region * * Insert new memblock region [@base,@base+@size) into @type at @idx. - * @type must already have extra room to accomodate the new region. + * @type must already have extra room to accommodate the new region. 
*/ static void __init_memblock memblock_insert_region(struct memblock_type *type, int idx, phys_addr_t base, @@ -544,7 +544,7 @@ repeat: /* * The following is executed twice. Once with %false @insert and * then with %true. The first counts the number of regions needed - * to accomodate the new area. The second actually inserts them. + * to accommodate the new area. The second actually inserts them. */ base = obase; nr_new = 0; @@ -994,7 +994,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, if (*idx == (u64)ULLONG_MAX) { idx_a = type_a->cnt - 1; - idx_b = type_b->cnt; + if (type_b != NULL) + idx_b = type_b->cnt; + else + idx_b = 0; } for (; idx_a >= 0; idx_a--) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 39a372a2a1d6..fb975cec3518 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5257,11 +5257,6 @@ static void __meminit setup_zone_pageset(struct zone *zone) zone->pageset = alloc_percpu(struct per_cpu_pageset); for_each_possible_cpu(cpu) zone_pageset_init(zone, cpu); - - if (!zone->zone_pgdat->per_cpu_nodestats) { - zone->zone_pgdat->per_cpu_nodestats = - alloc_percpu(struct per_cpu_nodestat); - } } /* @@ -5270,10 +5265,15 @@ static void __meminit setup_zone_pageset(struct zone *zone) */ void __init setup_per_cpu_pageset(void) { + struct pglist_data *pgdat; struct zone *zone; for_each_populated_zone(zone) setup_zone_pageset(zone); + + for_each_online_pgdat(pgdat) + pgdat->per_cpu_nodestats = + alloc_percpu(struct per_cpu_nodestat); } static noinline __ref diff --git a/mm/slub.c b/mm/slub.c index 26eb6a99540e..850737bdfbd8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,7 +124,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } -inline void *fixup_red_left(struct kmem_cache *s, void *p) +void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) p += s->red_left_pad; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 
e085f5ae1548..1d281816f2bf 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1230,8 +1230,9 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, if (status) goto out; - dprintk("RPC: svcauth_gss: gss major status = %d\n", - ud.major_status); + dprintk("RPC: svcauth_gss: gss major status = %d " + "minor status = %d\n", + ud.major_status, ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 553bf95f7003..4d8e11f94a35 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -362,7 +362,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) cache_purge(cd); spin_lock(&cache_list_lock); write_lock(&cd->hash_lock); - if (cd->entries || atomic_read(&cd->inuse)) { + if (cd->entries) { write_unlock(&cd->hash_lock); spin_unlock(&cache_list_lock); goto out; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 4f01f63102ee..c3f652395a80 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -21,6 +21,10 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static unsigned int svc_rpc_per_connection_limit __read_mostly; +module_param(svc_rpc_per_connection_limit, uint, 0644); + + static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); @@ -329,12 +333,45 @@ char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len) } EXPORT_SYMBOL_GPL(svc_print_addr); +static bool svc_xprt_slots_in_range(struct svc_xprt *xprt) +{ + unsigned int limit = svc_rpc_per_connection_limit; + int nrqsts = atomic_read(&xprt->xpt_nr_rqsts); + + return limit == 0 || (nrqsts >= 0 && nrqsts < limit); +} + +static bool svc_xprt_reserve_slot(struct svc_rqst *rqstp, struct svc_xprt *xprt) +{ + if (!test_bit(RQ_DATA, &rqstp->rq_flags)) { + if (!svc_xprt_slots_in_range(xprt)) + return false; + 
atomic_inc(&xprt->xpt_nr_rqsts); + set_bit(RQ_DATA, &rqstp->rq_flags); + } + return true; +} + +static void svc_xprt_release_slot(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + if (test_and_clear_bit(RQ_DATA, &rqstp->rq_flags)) { + atomic_dec(&xprt->xpt_nr_rqsts); + svc_xprt_enqueue(xprt); + } +} + static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) { if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE))) return true; - if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED))) - return xprt->xpt_ops->xpo_has_wspace(xprt); + if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED))) { + if (xprt->xpt_ops->xpo_has_wspace(xprt) && + svc_xprt_slots_in_range(xprt)) + return true; + trace_svc_xprt_no_write_space(xprt); + return false; + } return false; } @@ -480,8 +517,6 @@ void svc_reserve(struct svc_rqst *rqstp, int space) atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); rqstp->rq_reserved = space; - if (xprt->xpt_ops->xpo_adjust_wspace) - xprt->xpt_ops->xpo_adjust_wspace(xprt); svc_xprt_enqueue(xprt); } } @@ -512,8 +547,8 @@ static void svc_xprt_release(struct svc_rqst *rqstp) rqstp->rq_res.head[0].iov_len = 0; svc_reserve(rqstp, 0); + svc_xprt_release_slot(rqstp); rqstp->rq_xprt = NULL; - svc_xprt_put(xprt); } @@ -781,7 +816,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) svc_add_new_temp_xprt(serv, newxpt); else module_put(xprt->xpt_class->xcl_owner); - } else { + } else if (svc_xprt_reserve_slot(rqstp, xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", rqstp, rqstp->rq_pool->sp_id, xprt, @@ -871,6 +906,7 @@ EXPORT_SYMBOL_GPL(svc_recv); */ void svc_drop(struct svc_rqst *rqstp) { + trace_svc_drop(rqstp); dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); svc_xprt_release(rqstp); } @@ -1148,6 +1184,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) spin_unlock(&xprt->xpt_lock); dprintk("revisit canceled\n"); 
svc_xprt_put(xprt); + trace_svc_drop_deferred(dr); kfree(dr); return; } @@ -1205,6 +1242,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) set_bit(RQ_DROPME, &rqstp->rq_flags); dr->handle.revisit = svc_revisit; + trace_svc_defer(rqstp); return &dr->handle; } @@ -1245,6 +1283,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) struct svc_deferred_req, handle.recent); list_del_init(&dr->handle.recent); + trace_svc_revisit_deferred(dr); } else clear_bit(XPT_DEFERRED, &xprt->xpt_flags); spin_unlock(&xprt->xpt_lock); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index dadfec66dbd8..57625f64efd5 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -60,7 +60,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int flags); -static void svc_udp_data_ready(struct sock *); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); static void svc_sock_detach(struct svc_xprt *); @@ -398,48 +397,21 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp) return svc_port_is_privileged(svc_addr(rqstp)); } -static bool sunrpc_waitqueue_active(wait_queue_head_t *wq) -{ - if (!wq) - return false; - /* - * There should normally be a memory * barrier here--see - * wq_has_sleeper(). - * - * It appears that isn't currently necessary, though, basically - * because callers all appear to have sufficient memory barriers - * between the time the relevant change is made and the - * time they call these callbacks. - * - * The nfsd code itself doesn't actually explicitly wait on - * these waitqueues, but it may wait on them for example in - * sendpage() or sendmsg() calls. (And those may be the only - * places, since it it uses nonblocking reads.) - * - * Maybe we should add the memory barriers anyway, but these are - * hot paths so we'd need to be convinced there's no sigificant - * penalty. 
- */ - return waitqueue_active(wq); -} - /* * INET callback when data has been received on the socket. */ -static void svc_udp_data_ready(struct sock *sk) +static void svc_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq = sk_sleep(sk); if (svsk) { dprintk("svc: socket %p(inet %p), busy=%d\n", svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); + svsk->sk_odata(sk); + if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags)) + svc_xprt_enqueue(&svsk->sk_xprt); } - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible(wq); } /* @@ -448,56 +420,22 @@ static void svc_udp_data_ready(struct sock *sk) static void svc_write_space(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); - wait_queue_head_t *wq = sk_sleep(sk); if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + svsk->sk_owspace(sk); svc_xprt_enqueue(&svsk->sk_xprt); } - - if (sunrpc_waitqueue_active(wq)) { - dprintk("RPC svc_write_space: someone sleeping on %p\n", - svsk); - wake_up_interruptible(wq); - } } static int svc_tcp_has_wspace(struct svc_xprt *xprt) { - struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - struct svc_serv *serv = svsk->sk_xprt.xpt_server; - int required; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) return 1; - required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg; - if (sk_stream_wspace(svsk->sk_sk) >= required || - (sk_stream_min_wspace(svsk->sk_sk) == 0 && - atomic_read(&xprt->xpt_reserved) == 0)) - return 1; - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - return 0; -} - -static void svc_tcp_write_space(struct sock *sk) -{ - struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); - struct socket *sock = 
sk->sk_socket; - - if (!sk_stream_is_writeable(sk) || !sock) - return; - if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt)) - clear_bit(SOCK_NOSPACE, &sock->flags); - svc_write_space(sk); -} - -static void svc_tcp_adjust_wspace(struct svc_xprt *xprt) -{ - struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - - if (svc_tcp_has_wspace(xprt)) - clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); } /* @@ -746,7 +684,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class, &svsk->sk_xprt, serv); clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); - svsk->sk_sk->sk_data_ready = svc_udp_data_ready; + svsk->sk_sk->sk_data_ready = svc_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; /* initialise setting must have enough space to @@ -786,11 +724,12 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) static void svc_tcp_listen_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq; dprintk("svc: socket %p TCP (listen) state change %d\n", sk, sk->sk_state); + if (svsk) + svsk->sk_odata(sk); /* * This callback may called twice when a new connection * is established as a child socket inherits everything @@ -808,10 +747,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) } else printk("svc: socket %p: no user data\n", sk); } - - wq = sk_sleep(sk); - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible_all(wq); } /* @@ -820,7 +755,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) static void svc_tcp_state_change(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq = sk_sleep(sk); dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", sk, sk->sk_state, sk->sk_user_data); @@ -828,26 +762,12 @@ static void svc_tcp_state_change(struct sock *sk) if (!svsk) 
printk("svc: socket %p: no user data\n", sk); else { - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - } - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible_all(wq); -} - -static void svc_tcp_data_ready(struct sock *sk) -{ - struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq = sk_sleep(sk); - - dprintk("svc: socket %p TCP data ready (svsk %p)\n", - sk, sk->sk_user_data); - if (svsk) { - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); + svsk->sk_ostate(sk); + if (sk->sk_state != TCP_ESTABLISHED) { + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); + } } - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible(wq); } /* @@ -901,6 +821,11 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) dprintk("%s: connect from %s\n", serv->sv_name, __svc_print_addr(sin, buf, sizeof(buf))); + /* Reset the inherited callbacks before calling svc_setup_socket */ + newsock->sk->sk_state_change = svsk->sk_ostate; + newsock->sk->sk_data_ready = svsk->sk_odata; + newsock->sk->sk_write_space = svsk->sk_owspace; + /* make sure that a write doesn't block forever when * low on memory */ @@ -1317,7 +1242,6 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, .xpo_secure_port = svc_sock_secure_port, - .xpo_adjust_wspace = svc_tcp_adjust_wspace, }; static struct svc_xprt_class svc_tcp_class = { @@ -1357,8 +1281,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) } else { dprintk("setting up TCP socket for reading\n"); sk->sk_state_change = svc_tcp_state_change; - sk->sk_data_ready = svc_tcp_data_ready; - sk->sk_write_space = svc_tcp_write_space; + sk->sk_data_ready = svc_data_ready; + sk->sk_write_space = svc_write_space; svsk->sk_reclen = 0; svsk->sk_tcplen = 0; @@ -1368,8 +1292,13 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) 
tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - if (sk->sk_state != TCP_ESTABLISHED) + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + break; + default: set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + } } } @@ -1428,17 +1357,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Initialize the socket */ if (sock->type == SOCK_DGRAM) svc_udp_init(svsk, serv); - else { - /* initialise setting must have enough space to - * receive and respond to one request. - */ - svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg, - 4 * serv->sv_max_mesg); + else svc_tcp_init(svsk, serv); - } - dprintk("svc: svc_setup_socket created %p (inet %p)\n", - svsk, svsk->sk_sk); + dprintk("svc: svc_setup_socket created %p (inet %p), " + "listen %d close %d\n", + svsk, svsk->sk_sk, + test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); return svsk; } @@ -1606,18 +1532,16 @@ static void svc_sock_detach(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sock *sk = svsk->sk_sk; - wait_queue_head_t *wq; dprintk("svc: svc_sock_detach(%p)\n", svsk); /* put back the old socket callbacks */ + lock_sock(sk); sk->sk_state_change = svsk->sk_ostate; sk->sk_data_ready = svsk->sk_odata; sk->sk_write_space = svsk->sk_owspace; - - wq = sk_sleep(sk); - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible(wq); + sk->sk_user_data = NULL; + release_sock(sk); } /* diff --git a/sound/arm/Kconfig b/sound/arm/Kconfig index e0406211716b..65171f6657a2 100644 --- a/sound/arm/Kconfig +++ b/sound/arm/Kconfig @@ -9,14 +9,6 @@ menuconfig SND_ARM Drivers that are implemented on ASoC can be found in "ALSA for SoC audio support" section. 
-config SND_PXA2XX_LIB - tristate - select SND_AC97_CODEC if SND_PXA2XX_LIB_AC97 - select SND_DMAENGINE_PCM - -config SND_PXA2XX_LIB_AC97 - bool - if SND_ARM config SND_ARMAACI @@ -42,3 +34,10 @@ config SND_PXA2XX_AC97 endif # SND_ARM +config SND_PXA2XX_LIB + tristate + select SND_AC97_CODEC if SND_PXA2XX_LIB_AC97 + select SND_DMAENGINE_PCM + +config SND_PXA2XX_LIB_AC97 + bool diff --git a/sound/hda/array.c b/sound/hda/array.c index 516795baa7db..5dfa610e4471 100644 --- a/sound/hda/array.c +++ b/sound/hda/array.c @@ -21,13 +21,15 @@ void *snd_array_new(struct snd_array *array) return NULL; if (array->used >= array->alloced) { int num = array->alloced + array->alloc_align; + int oldsize = array->alloced * array->elem_size; int size = (num + 1) * array->elem_size; void *nlist; if (snd_BUG_ON(num >= 4096)) return NULL; - nlist = krealloc(array->list, size, GFP_KERNEL | __GFP_ZERO); + nlist = krealloc(array->list, size, GFP_KERNEL); if (!nlist) return NULL; + memset(nlist + oldsize, 0, size - oldsize); array->list = nlist; array->alloced = num; } diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 6f8ea13323c1..89dacf9b4e6c 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2265,6 +2265,8 @@ static const struct pci_device_id azx_ids[] = { { PCI_DEVICE(0x1022, 0x780d), .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_ATI_SB }, /* ATI HDMI */ + { PCI_DEVICE(0x1002, 0x0002), + .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS }, { PCI_DEVICE(0x1002, 0x1308), .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS }, { PCI_DEVICE(0x1002, 0x157a), diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index ddd29b9819ba..574b1b48996f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4689,6 +4689,22 @@ static void alc290_fixup_mono_speakers(struct hda_codec *codec, } } +static void alc298_fixup_speaker_volume(struct hda_codec *codec, + const 
struct hda_fixup *fix, int action) +{ + if (action == HDA_FIXUP_ACT_PRE_PROBE) { + /* The speaker is routed to the Node 0x06 by a mistake, as a result + we can't adjust the speaker's volume since this node does not has + Amp-out capability. we change the speaker's route to: + Node 0x02 (Audio Output) -> Node 0x0c (Audio Mixer) -> Node 0x17 ( + Pin Complex), since Node 0x02 has Amp-out caps, we can adjust + speaker's volume now. */ + + hda_nid_t conn1[1] = { 0x0c }; + snd_hda_override_conn_list(codec, 0x17, 1, conn1); + } +} + /* Hook to update amp GPIO4 for automute */ static void alc280_hp_gpio4_automute_hook(struct hda_codec *codec, struct hda_jack_callback *jack) @@ -4838,6 +4854,7 @@ enum { ALC280_FIXUP_HP_HEADSET_MIC, ALC221_FIXUP_HP_FRONT_MIC, ALC292_FIXUP_TPT460, + ALC298_FIXUP_SPK_VOLUME, }; static const struct hda_fixup alc269_fixups[] = { @@ -5493,6 +5510,12 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC293_FIXUP_LENOVO_SPK_NOISE, }, + [ALC298_FIXUP_SPK_VOLUME] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc298_fixup_speaker_volume, + .chained = true, + .chain_id = ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -5539,6 +5562,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0704, "Dell XPS 13 9350", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE), SND_PCI_QUIRK(0x1028, 0x0725, "Dell Inspiron 3162", ALC255_FIXUP_DELL_SPK_NOISE), SND_PCI_QUIRK(0x1028, 0x075b, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE), + SND_PCI_QUIRK(0x1028, 0x075d, "Dell AIO", ALC298_FIXUP_SPK_VOLUME), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), @@ -5814,6 +5838,10 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x1b, 0x01014020}, {0x21, 0x0221103f}), 
SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, + {0x14, 0x90170130}, + {0x1b, 0x02011020}, + {0x21, 0x0221103f}), + SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, {0x14, 0x90170150}, {0x1b, 0x02011020}, {0x21, 0x0221105f}), @@ -6549,6 +6577,7 @@ enum { ALC668_FIXUP_ASUS_Nx51, ALC891_FIXUP_HEADSET_MODE, ALC891_FIXUP_DELL_MIC_NO_PRESENCE, + ALC662_FIXUP_ACER_VERITON, }; static const struct hda_fixup alc662_fixups[] = { @@ -6818,6 +6847,13 @@ static const struct hda_fixup alc662_fixups[] = { .chained = true, .chain_id = ALC891_FIXUP_HEADSET_MODE }, + [ALC662_FIXUP_ACER_VERITON] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x15, 0x50170120 }, /* no internal speaker */ + { } + } + }, }; static const struct snd_pci_quirk alc662_fixup_tbl[] = { @@ -6856,6 +6892,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x38af, "Lenovo Ideapad Y550P", ALC662_FIXUP_IDEAPAD), SND_PCI_QUIRK(0x17aa, 0x3a0d, "Lenovo Ideapad Y550", ALC662_FIXUP_IDEAPAD), SND_PCI_QUIRK(0x19da, 0xa130, "Zotac Z68", ALC662_FIXUP_ZOTAC_Z68), + SND_PCI_QUIRK(0x1b0a, 0x01b8, "ACER Veriton", ALC662_FIXUP_ACER_VERITON), SND_PCI_QUIRK(0x1b35, 0x2206, "CZC P10T", ALC662_FIXUP_CZC_P10T), #if 0 diff --git a/tools/testing/selftests/timers/rtctest.c b/tools/testing/selftests/timers/rtctest.c index 624bce51b27d..4230d3052e5d 100644 --- a/tools/testing/selftests/timers/rtctest.c +++ b/tools/testing/selftests/timers/rtctest.c @@ -144,11 +144,12 @@ test_READ: retval = ioctl(fd, RTC_ALM_SET, &rtc_tm); if (retval == -1) { - if (errno == ENOTTY) { + if (errno == EINVAL) { fprintf(stderr, "\n...Alarm IRQs not supported.\n"); goto test_PIE; } + perror("RTC_ALM_SET ioctl"); exit(errno); } @@ -166,6 +167,12 @@ test_READ: /* Enable alarm interrupts */ retval = ioctl(fd, RTC_AIE_ON, 0); if (retval == -1) { + if (errno == EINVAL) { + fprintf(stderr, + "\n...Alarm IRQs not supported.\n"); + goto 
test_PIE; + } + perror("RTC_AIE_ON ioctl"); exit(errno); } @@ -193,7 +200,7 @@ test_PIE: retval = ioctl(fd, RTC_IRQP_READ, &tmp); if (retval == -1) { /* not all RTCs support periodic IRQs */ - if (errno == ENOTTY) { + if (errno == EINVAL) { fprintf(stderr, "\nNo periodic IRQ support\n"); goto done; } @@ -211,7 +218,7 @@ test_PIE: retval = ioctl(fd, RTC_IRQP_SET, tmp); if (retval == -1) { /* not all RTCs can change their periodic IRQ rate */ - if (errno == ENOTTY) { + if (errno == EINVAL) { fprintf(stderr, "\n...Periodic IRQ rate is fixed\n"); goto done; |