Diffstat (limited to 'arch/arc')
-rw-r--r--  arch/arc/Kconfig                        |  18
-rw-r--r--  arch/arc/Makefile                       |  13
-rw-r--r--  arch/arc/boot/dts/axc003.dtsi           |  15
-rw-r--r--  arch/arc/boot/dts/axc003_idu.dtsi       |   2
-rw-r--r--  arch/arc/include/asm/Kbuild             |   1
-rw-r--r--  arch/arc/include/asm/arcregs.h          |   8
-rw-r--r--  arch/arc/include/asm/atomic.h           |  86
-rw-r--r--  arch/arc/include/asm/bitops.h           |  35
-rw-r--r--  arch/arc/include/asm/cache.h            |   8
-rw-r--r--  arch/arc/include/asm/cmpxchg.h          |  22
-rw-r--r--  arch/arc/include/asm/futex.h            | 104
-rw-r--r--  arch/arc/include/asm/mm-arch-hooks.h    |  15
-rw-r--r--  arch/arc/include/asm/perf_event.h       |  23
-rw-r--r--  arch/arc/include/asm/ptrace.h           |  52
-rw-r--r--  arch/arc/include/asm/spinlock.h         | 538
-rw-r--r--  arch/arc/include/asm/spinlock_types.h   |   2
-rw-r--r--  arch/arc/include/uapi/asm/ptrace.h      |  20
-rw-r--r--  arch/arc/kernel/entry-arcv2.S           |   9
-rw-r--r--  arch/arc/kernel/entry.S                 |   6
-rw-r--r--  arch/arc/kernel/intc-arcv2.c            |   1
-rw-r--r--  arch/arc/kernel/intc-compact.c          |   1
-rw-r--r--  arch/arc/kernel/mcip.c                  |  26
-rw-r--r--  arch/arc/kernel/perf_event.c            | 275
-rw-r--r--  arch/arc/kernel/process.c               |   2
-rw-r--r--  arch/arc/kernel/setup.c                 |  27
-rw-r--r--  arch/arc/kernel/time.c                  |  40
-rw-r--r--  arch/arc/kernel/troubleshoot.c          |   1
-rw-r--r--  arch/arc/kernel/unaligned.c             |   6
-rw-r--r--  arch/arc/lib/memcpy-archs.S             |   2
-rw-r--r--  arch/arc/lib/memset-archs.S             |  43
-rw-r--r--  arch/arc/mm/cache.c                     | 196
-rw-r--r--  arch/arc/mm/dma.c                       |  22
-rw-r--r--  arch/arc/plat-axs10x/axs10x.c           |  17
33 files changed, 1321 insertions(+), 315 deletions(-)
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index e7cee0a5c56d..78c0621d5819 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -8,6 +8,7 @@
config ARC
def_bool y
+ select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC
select BUILDTIME_EXTABLE_SORT
select COMMON_CLK
select CLONE_BACKWARDS
@@ -22,6 +23,7 @@ config ARC
select GENERIC_SMP_IDLE_THREAD
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
+ select HAVE_FUTEX_CMPXCHG
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
select HAVE_KRETPROBES
@@ -115,6 +117,7 @@ if ISA_ARCOMPACT
config ARC_CPU_750D
bool "ARC750D"
+ select ARC_CANT_LLSC
help
Support for ARC750 core
@@ -312,11 +315,11 @@ config ARC_PAGE_SIZE_8K
config ARC_PAGE_SIZE_16K
bool "16KB"
- depends on ARC_MMU_V3
+ depends on ARC_MMU_V3 || ARC_MMU_V4
config ARC_PAGE_SIZE_4K
bool "4KB"
- depends on ARC_MMU_V3
+ depends on ARC_MMU_V3 || ARC_MMU_V4
endchoice
@@ -362,7 +365,12 @@ config ARC_CANT_LLSC
config ARC_HAS_LLSC
bool "Insn: LLOCK/SCOND (efficient atomic ops)"
default y
- depends on !ARC_CPU_750D && !ARC_CANT_LLSC
+ depends on !ARC_CANT_LLSC
+
+config ARC_STAR_9000923308
+ bool "Workaround for llock/scond livelock"
+ default y
+ depends on ISA_ARCV2 && SMP && ARC_HAS_LLSC
config ARC_HAS_SWAPE
bool "Insn: SWAPE (endian-swap)"
@@ -378,6 +386,10 @@ config ARC_HAS_LL64
dest operands with 2 possible source operands.
default y
+config ARC_HAS_DIV_REM
+ bool "Insn: div, divu, rem, remu"
+ default y
+
config ARC_HAS_RTC
bool "Local 64-bit r/o cycle counter"
default n
diff --git a/arch/arc/Makefile b/arch/arc/Makefile
index 6107062c0111..8a27a48304a4 100644
--- a/arch/arc/Makefile
+++ b/arch/arc/Makefile
@@ -36,8 +36,16 @@ cflags-$(atleast_gcc44) += -fsection-anchors
cflags-$(CONFIG_ARC_HAS_LLSC) += -mlock
cflags-$(CONFIG_ARC_HAS_SWAPE) += -mswape
+ifdef CONFIG_ISA_ARCV2
+
ifndef CONFIG_ARC_HAS_LL64
-cflags-$(CONFIG_ISA_ARCV2) += -mno-ll64
+cflags-y += -mno-ll64
+endif
+
+ifndef CONFIG_ARC_HAS_DIV_REM
+cflags-y += -mno-div-rem
+endif
+
endif
cflags-$(CONFIG_ARC_DW2_UNWIND) += -fasynchronous-unwind-tables
@@ -49,7 +57,8 @@ endif
ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE
# Generic build system uses -O2, we want -O3
-cflags-y += -O3
+# Note: No need to add to cflags-y as that happens anyways
+ARCH_CFLAGS += -O3
endif
# small data is default for elf32 tool-chain. If not usable, disable it
diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi
index 15c8d6226c9d..846481f37eef 100644
--- a/arch/arc/boot/dts/axc003.dtsi
+++ b/arch/arc/boot/dts/axc003.dtsi
@@ -12,7 +12,7 @@
/ {
compatible = "snps,arc";
- clock-frequency = <75000000>;
+ clock-frequency = <90000000>;
#address-cells = <1>;
#size-cells = <1>;
@@ -72,12 +72,13 @@
};
/*
- * This INTC is actually connected to DW APB GPIO
- * which acts as a wire between MB INTC and CPU INTC.
- * GPIO INTC is configured in platform init code
- * and here we mimic direct connection from MB INTC to
- * CPU INTC, thus we set "interrupts = <7>" instead of
- * "interrupts = <12>"
+ * The DW APB ICTL intc on MB is connected to CPU intc via a
+ * DT "invisible" DW APB GPIO block, configured to simply pass thru
+	 * interrupts - set up accordingly in platform init (plat-axs10x/axs10x.c)
+	 *
+	 * So here we mimic a direct connection between them, ignoring the
+	 * APB GPIO. Thus set "interrupts = <24>" (DW APB GPIO to core)
+ * instead of "interrupts = <12>" (DW APB ICTL to DW APB GPIO)
*
* This intc actually resides on MB, but we move it here to
* avoid duplicating the MB dtsi file given that IRQ from
diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi
index 199d42820eca..2f0b33257db2 100644
--- a/arch/arc/boot/dts/axc003_idu.dtsi
+++ b/arch/arc/boot/dts/axc003_idu.dtsi
@@ -12,7 +12,7 @@
/ {
compatible = "snps,arc";
- clock-frequency = <75000000>;
+ clock-frequency = <90000000>;
#address-cells = <1>;
#size-cells = <1>;
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index 1a80cc91a03b..7611b10a2d23 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -22,6 +22,7 @@ generic-y += kvm_para.h
generic-y += local.h
generic-y += local64.h
generic-y += mcs_spinlock.h
+generic-y += mm-arch-hooks.h
generic-y += mman.h
generic-y += msgbuf.h
generic-y += param.h
diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h
index 070f58827a5c..d8023bc8d1ad 100644
--- a/arch/arc/include/asm/arcregs.h
+++ b/arch/arc/include/asm/arcregs.h
@@ -35,6 +35,7 @@
#define ARC_REG_RTT_BCR 0xF2
#define ARC_REG_IRQ_BCR 0xF3
#define ARC_REG_SMART_BCR 0xFF
+#define ARC_REG_CLUSTER_BCR 0xcf
/* status32 Bits Positions */
#define STATUS_AE_BIT 5 /* Exception active */
@@ -89,11 +90,10 @@
#define ECR_C_BIT_DTLB_LD_MISS 8
#define ECR_C_BIT_DTLB_ST_MISS 9
-
/* Auxiliary registers */
#define AUX_IDENTITY 4
#define AUX_INTR_VEC_BASE 0x25
-
+#define AUX_NON_VOL 0x5e
/*
* Floating Pt Registers
@@ -240,9 +240,9 @@ struct bcr_extn_xymem {
struct bcr_perip {
#ifdef CONFIG_CPU_BIG_ENDIAN
- unsigned int start:8, pad2:8, sz:8, pad:8;
+ unsigned int start:8, pad2:8, sz:8, ver:8;
#else
- unsigned int pad:8, sz:8, pad2:8, start:8;
+ unsigned int ver:8, sz:8, pad2:8, start:8;
#endif
};
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 03484cb4d16d..c3ecda023e3a 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -23,33 +23,60 @@
#define atomic_set(v, i) (((v)->counter) = (i))
-#ifdef CONFIG_ISA_ARCV2
-#define PREFETCHW " prefetchw [%1] \n"
-#else
-#define PREFETCHW
+#ifdef CONFIG_ARC_STAR_9000923308
+
+#define SCOND_FAIL_RETRY_VAR_DEF \
+ unsigned int delay = 1, tmp; \
+
+#define SCOND_FAIL_RETRY_ASM \
+ " bz 4f \n" \
+ " ; --- scond fail delay --- \n" \
+ " mov %[tmp], %[delay] \n" /* tmp = delay */ \
+ "2: brne.d %[tmp], 0, 2b \n" /* while (tmp != 0) */ \
+ " sub %[tmp], %[tmp], 1 \n" /* tmp-- */ \
+ " rol %[delay], %[delay] \n" /* delay *= 2 */ \
+ " b 1b \n" /* start over */ \
+ "4: ; --- success --- \n" \
+
+#define SCOND_FAIL_RETRY_VARS \
+ ,[delay] "+&r" (delay),[tmp] "=&r" (tmp) \
+
+#else /* !CONFIG_ARC_STAR_9000923308 */
+
+#define SCOND_FAIL_RETRY_VAR_DEF
+
+#define SCOND_FAIL_RETRY_ASM \
+ " bnz 1b \n" \
+
+#define SCOND_FAIL_RETRY_VARS
+
#endif
#define ATOMIC_OP(op, c_op, asm_op) \
static inline void atomic_##op(int i, atomic_t *v) \
{ \
- unsigned int temp; \
+ unsigned int val; \
+ SCOND_FAIL_RETRY_VAR_DEF \
\
__asm__ __volatile__( \
- "1: \n" \
- PREFETCHW \
- " llock %0, [%1] \n" \
- " " #asm_op " %0, %0, %2 \n" \
- " scond %0, [%1] \n" \
- " bnz 1b \n" \
- : "=&r"(temp) /* Early clobber, to prevent reg reuse */ \
- : "r"(&v->counter), "ir"(i) \
+ "1: llock %[val], [%[ctr]] \n" \
+ " " #asm_op " %[val], %[val], %[i] \n" \
+ " scond %[val], [%[ctr]] \n" \
+ " \n" \
+ SCOND_FAIL_RETRY_ASM \
+ \
+ : [val] "=&r" (val) /* Early clobber to prevent reg reuse */ \
+ SCOND_FAIL_RETRY_VARS \
+ : [ctr] "r" (&v->counter), /* Not "m": llock only supports reg direct addr mode */ \
+ [i] "ir" (i) \
: "cc"); \
} \
#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
static inline int atomic_##op##_return(int i, atomic_t *v) \
{ \
- unsigned int temp; \
+ unsigned int val; \
+ SCOND_FAIL_RETRY_VAR_DEF \
\
/* \
* Explicit full memory barrier needed before/after as \
@@ -58,19 +85,21 @@ static inline int atomic_##op##_return(int i, atomic_t *v) \
smp_mb(); \
\
__asm__ __volatile__( \
- "1: \n" \
- PREFETCHW \
- " llock %0, [%1] \n" \
- " " #asm_op " %0, %0, %2 \n" \
- " scond %0, [%1] \n" \
- " bnz 1b \n" \
- : "=&r"(temp) \
- : "r"(&v->counter), "ir"(i) \
+ "1: llock %[val], [%[ctr]] \n" \
+ " " #asm_op " %[val], %[val], %[i] \n" \
+ " scond %[val], [%[ctr]] \n" \
+ " \n" \
+ SCOND_FAIL_RETRY_ASM \
+ \
+ : [val] "=&r" (val) \
+ SCOND_FAIL_RETRY_VARS \
+ : [ctr] "r" (&v->counter), \
+ [i] "ir" (i) \
: "cc"); \
\
smp_mb(); \
\
- return temp; \
+ return val; \
}
#else /* !CONFIG_ARC_HAS_LLSC */
@@ -143,13 +172,20 @@ static inline int atomic_##op##_return(int i, atomic_t *v) \
ATOMIC_OPS(add, +=, add)
ATOMIC_OPS(sub, -=, sub)
-ATOMIC_OP(and, &=, and)
-#define atomic_clear_mask(mask, v) atomic_and(~(mask), (v))
+#define atomic_andnot atomic_andnot
+
+ATOMIC_OP(and, &=, and)
+ATOMIC_OP(andnot, &= ~, bic)
+ATOMIC_OP(or, |=, or)
+ATOMIC_OP(xor, ^=, xor)
#undef ATOMIC_OPS
#undef ATOMIC_OP_RETURN
#undef ATOMIC_OP
+#undef SCOND_FAIL_RETRY_VAR_DEF
+#undef SCOND_FAIL_RETRY_ASM
+#undef SCOND_FAIL_RETRY_VARS
/**
* __atomic_add_unless - add unless the number is a given value
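
As an editorial aside (not part of the patch): the retry loop that ATOMIC_OP() now emits can be modelled in portable C11, with a weak compare-and-swap standing in for the LLOCK/SCOND pair. The helper name and the use of <stdatomic.h> are assumptions for illustration, not kernel code:

	#include <stdatomic.h>

	/*
	 * User-space sketch of the loop generated by ATOMIC_OP(add, +=, add):
	 * load the counter (llock), apply the operation, attempt the
	 * conditional store (scond) and retry if another CPU intervened.
	 */
	static void atomic_add_model(int i, _Atomic unsigned int *ctr)
	{
		unsigned int old, new;

		do {
			old = atomic_load_explicit(ctr, memory_order_relaxed); /* ~ llock */
			new = old + i;                                          /* asm_op  */
		} while (!atomic_compare_exchange_weak(ctr, &old, new));       /* ~ scond + bnz */
	}
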
diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h
index 99fe118d3730..57c1f33844d4 100644
--- a/arch/arc/include/asm/bitops.h
+++ b/arch/arc/include/asm/bitops.h
@@ -50,8 +50,7 @@ static inline void op##_bit(unsigned long nr, volatile unsigned long *m)\
* done for const @nr, but no code is generated due to gcc \
* const prop. \
*/ \
- if (__builtin_constant_p(nr)) \
- nr &= 0x1f; \
+ nr &= 0x1f; \
\
__asm__ __volatile__( \
"1: llock %0, [%1] \n" \
@@ -82,8 +81,7 @@ static inline int test_and_##op##_bit(unsigned long nr, volatile unsigned long *
\
m += nr >> 5; \
\
- if (__builtin_constant_p(nr)) \
- nr &= 0x1f; \
+ nr &= 0x1f; \
\
/* \
* Explicit full memory barrier needed before/after as \
@@ -129,16 +127,13 @@ static inline void op##_bit(unsigned long nr, volatile unsigned long *m)\
unsigned long temp, flags; \
m += nr >> 5; \
\
- if (__builtin_constant_p(nr)) \
- nr &= 0x1f; \
- \
/* \
* spin lock/unlock provide the needed smp_mb() before/after \
*/ \
bitops_lock(flags); \
\
temp = *m; \
- *m = temp c_op (1UL << nr); \
+ *m = temp c_op (1UL << (nr & 0x1f)); \
\
bitops_unlock(flags); \
}
@@ -149,17 +144,14 @@ static inline int test_and_##op##_bit(unsigned long nr, volatile unsigned long *
unsigned long old, flags; \
m += nr >> 5; \
\
- if (__builtin_constant_p(nr)) \
- nr &= 0x1f; \
- \
bitops_lock(flags); \
\
old = *m; \
- *m = old c_op (1 << nr); \
+ *m = old c_op (1UL << (nr & 0x1f)); \
\
bitops_unlock(flags); \
\
- return (old & (1 << nr)) != 0; \
+ return (old & (1UL << (nr & 0x1f))) != 0; \
}
#endif /* CONFIG_ARC_HAS_LLSC */
@@ -174,11 +166,8 @@ static inline void __##op##_bit(unsigned long nr, volatile unsigned long *m) \
unsigned long temp; \
m += nr >> 5; \
\
- if (__builtin_constant_p(nr)) \
- nr &= 0x1f; \
- \
temp = *m; \
- *m = temp c_op (1UL << nr); \
+ *m = temp c_op (1UL << (nr & 0x1f)); \
}
#define __TEST_N_BIT_OP(op, c_op, asm_op) \
@@ -187,13 +176,10 @@ static inline int __test_and_##op##_bit(unsigned long nr, volatile unsigned long
unsigned long old; \
m += nr >> 5; \
\
- if (__builtin_constant_p(nr)) \
- nr &= 0x1f; \
- \
old = *m; \
- *m = old c_op (1 << nr); \
+ *m = old c_op (1UL << (nr & 0x1f)); \
\
- return (old & (1 << nr)) != 0; \
+ return (old & (1UL << (nr & 0x1f))) != 0; \
}
#define BIT_OPS(op, c_op, asm_op) \
@@ -224,10 +210,7 @@ test_bit(unsigned int nr, const volatile unsigned long *addr)
addr += nr >> 5;
- if (__builtin_constant_p(nr))
- nr &= 0x1f;
-
- mask = 1 << nr;
+ mask = 1UL << (nr & 0x1f);
return ((mask & *addr) != 0);
}
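
A minimal sketch (illustration only, not from the patch) of why the masking is now unconditional: the word is selected by nr >> 5, so the shift count itself must be reduced to nr & 0x1f for callers that pass nr >= 32. The helper and main() below are hypothetical:

	#include <stdio.h>

	/*
	 * Model of test_bit() after the change: pick the word with nr >> 5,
	 * then index the bit within that word with (nr & 0x1f), so the shift
	 * count never reaches the word width.
	 */
	static int test_bit_model(unsigned int nr, const unsigned long *addr)
	{
		addr += nr >> 5;
		return (*addr & (1UL << (nr & 0x1f))) != 0;
	}

	int main(void)
	{
		unsigned long map[2] = { 0, 1UL << 3 };

		printf("%d\n", test_bit_model(35, map));	/* bit 3 of word 1 -> prints 1 */
		return 0;
	}
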
diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h
index d67345d3e2d4..e23ea6e7633a 100644
--- a/arch/arc/include/asm/cache.h
+++ b/arch/arc/include/asm/cache.h
@@ -53,6 +53,8 @@ extern void arc_cache_init(void);
extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len);
extern void read_decode_cache_bcr(void);
+extern int ioc_exists;
+
#endif /* !__ASSEMBLY__ */
/* Instruction cache related Auxiliary registers */
@@ -94,4 +96,10 @@ extern void read_decode_cache_bcr(void);
#define SLC_CTRL_BUSY 0x100
#define SLC_CTRL_RGN_OP_INV 0x200
+/* IO coherency related Auxiliary registers */
+#define ARC_REG_IO_COH_ENABLE 0x500
+#define ARC_REG_IO_COH_PARTIAL 0x501
+#define ARC_REG_IO_COH_AP0_BASE 0x508
+#define ARC_REG_IO_COH_AP0_SIZE 0x509
+
#endif /* _ASM_CACHE_H */
diff --git a/arch/arc/include/asm/cmpxchg.h b/arch/arc/include/asm/cmpxchg.h
index 44fd531f4d7b..af7a2db139c9 100644
--- a/arch/arc/include/asm/cmpxchg.h
+++ b/arch/arc/include/asm/cmpxchg.h
@@ -110,18 +110,18 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr,
sizeof(*(ptr))))
/*
- * On ARC700, EX insn is inherently atomic, so by default "vanilla" xchg() need
- * not require any locking. However there's a quirk.
- * ARC lacks native CMPXCHG, thus emulated (see above), using external locking -
- * incidently it "reuses" the same atomic_ops_lock used by atomic APIs.
- * Now, llist code uses cmpxchg() and xchg() on same data, so xchg() needs to
- * abide by same serializing rules, thus ends up using atomic_ops_lock as well.
+ * xchg() maps directly to ARC EX instruction which guarantees atomicity.
+ * However in !LLSC config, it also needs to take the @atomic_ops_lock spinlock
+ * due to a subtle reason:
+ * - For !LLSC, cmpxchg() needs to use that lock (see above) and there is a lot
+ * of kernel code which calls xchg()/cmpxchg() on same data (see llist.h)
+ * Hence xchg() needs to follow same locking rules.
*
- * This however is only relevant if SMP and/or ARC lacks LLSC
- * if (UP or LLSC)
- * xchg doesn't need serialization
- * else <==> !(UP or LLSC) <==> (!UP and !LLSC) <==> (SMP and !LLSC)
- * xchg needs serialization
+ * Technically the lock is also needed for UP (boils down to irq save/restore)
+ * but we can cheat a bit since cmpxchg() atomic_ops_lock() would cause irqs to
+ * be disabled, thus can't possibly be interrupted/preempted/clobbered by xchg()
+ * Other way around, xchg() is one instruction anyway, so can't be interrupted
+ * as such
*/
#if !defined(CONFIG_ARC_HAS_LLSC) && defined(CONFIG_SMP)
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h
index 05b5aaf5b0f9..11e1b1f3acda 100644
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -16,18 +16,22 @@
#include <linux/uaccess.h>
#include <asm/errno.h>
+#ifdef CONFIG_ARC_HAS_LLSC
+
#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg)\
\
+ smp_mb(); \
__asm__ __volatile__( \
- "1: ld %1, [%2] \n" \
+ "1: llock %1, [%2] \n" \
insn "\n" \
- "2: st %0, [%2] \n" \
+ "2: scond %0, [%2] \n" \
+ " bnz 1b \n" \
" mov %0, 0 \n" \
"3: \n" \
" .section .fixup,\"ax\" \n" \
" .align 4 \n" \
"4: mov %0, %4 \n" \
- " b 3b \n" \
+ " j 3b \n" \
" .previous \n" \
" .section __ex_table,\"a\" \n" \
" .align 4 \n" \
@@ -37,7 +41,37 @@
\
: "=&r" (ret), "=&r" (oldval) \
: "r" (uaddr), "r" (oparg), "ir" (-EFAULT) \
- : "cc", "memory")
+ : "cc", "memory"); \
+ smp_mb() \
+
+#else /* !CONFIG_ARC_HAS_LLSC */
+
+#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg)\
+ \
+ smp_mb(); \
+ __asm__ __volatile__( \
+ "1: ld %1, [%2] \n" \
+ insn "\n" \
+ "2: st %0, [%2] \n" \
+ " mov %0, 0 \n" \
+ "3: \n" \
+ " .section .fixup,\"ax\" \n" \
+ " .align 4 \n" \
+ "4: mov %0, %4 \n" \
+ " j 3b \n" \
+ " .previous \n" \
+ " .section __ex_table,\"a\" \n" \
+ " .align 4 \n" \
+ " .word 1b, 4b \n" \
+ " .word 2b, 4b \n" \
+ " .previous \n" \
+ \
+ : "=&r" (ret), "=&r" (oldval) \
+ : "r" (uaddr), "r" (oparg), "ir" (-EFAULT) \
+ : "cc", "memory"); \
+ smp_mb() \
+
+#endif
static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
{
@@ -53,6 +87,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
return -EFAULT;
+#ifndef CONFIG_ARC_HAS_LLSC
+ preempt_disable(); /* to guarantee atomic r-m-w of futex op */
+#endif
pagefault_disable();
switch (op) {
@@ -60,6 +97,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
__futex_atomic_op("mov %0, %3", ret, oldval, uaddr, oparg);
break;
case FUTEX_OP_ADD:
+ /* oldval = *uaddr; *uaddr += oparg ; ret = *uaddr */
__futex_atomic_op("add %0, %1, %3", ret, oldval, uaddr, oparg);
break;
case FUTEX_OP_OR:
@@ -76,6 +114,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
}
pagefault_enable();
+#ifndef CONFIG_ARC_HAS_LLSC
+ preempt_enable();
+#endif
if (!ret) {
switch (cmp) {
@@ -104,48 +145,57 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
return ret;
}
-/* Compare-xchg with pagefaults disabled.
- * Notes:
- * -Best-Effort: Exchg happens only if compare succeeds.
- * If compare fails, returns; leaving retry/looping to upper layers
- * -successful cmp-xchg: return orig value in @addr (same as cmp val)
- * -Compare fails: return orig value in @addr
- * -user access r/w fails: return -EFAULT
+/*
+ * cmpxchg of futex (pagefaults disabled by caller)
+ * Return 0 for success, -EFAULT otherwise
*/
static inline int
-futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
- u32 newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval,
+ u32 newval)
{
- u32 val;
+ int ret = 0;
+ u32 existval;
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
return -EFAULT;
- pagefault_disable();
+#ifndef CONFIG_ARC_HAS_LLSC
+ preempt_disable(); /* to guarantee atomic r-m-w of futex op */
+#endif
+ smp_mb();
- /* TBD : can use llock/scond */
__asm__ __volatile__(
- "1: ld %0, [%3] \n"
- " brne %0, %1, 3f \n"
- "2: st %2, [%3] \n"
+#ifdef CONFIG_ARC_HAS_LLSC
+ "1: llock %1, [%4] \n"
+ " brne %1, %2, 3f \n"
+ "2: scond %3, [%4] \n"
+ " bnz 1b \n"
+#else
+ "1: ld %1, [%4] \n"
+ " brne %1, %2, 3f \n"
+ "2: st %3, [%4] \n"
+#endif
"3: \n"
" .section .fixup,\"ax\" \n"
- "4: mov %0, %4 \n"
- " b 3b \n"
+ "4: mov %0, %5 \n"
+ " j 3b \n"
" .previous \n"
" .section __ex_table,\"a\" \n"
" .align 4 \n"
" .word 1b, 4b \n"
" .word 2b, 4b \n"
" .previous\n"
- : "=&r"(val)
- : "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT)
+ : "+&r"(ret), "=&r"(existval)
+ : "r"(expval), "r"(newval), "r"(uaddr), "ir"(-EFAULT)
: "cc", "memory");
- pagefault_enable();
+ smp_mb();
- *uval = val;
- return val;
+#ifndef CONFIG_ARC_HAS_LLSC
+ preempt_enable();
+#endif
+ *uval = existval;
+ return ret;
}
#endif
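
The reworked futex_atomic_cmpxchg_inatomic() contract, paraphrased as a sketch in C (names mirror the patch, but this is only an illustration of the semantics, not the driver code):

	/*
	 * Return 0 when the user access succeeded (whether or not the compare
	 * matched), -EFAULT on a faulting access, and always report the value
	 * found at *uaddr through *uval so generic futex code can decide to retry.
	 */
	static int futex_cmpxchg_model(unsigned int *uval, unsigned int *uaddr,
				       unsigned int expval, unsigned int newval)
	{
		unsigned int existval = *uaddr;		/* ld / llock               */

		if (existval == expval)
			*uaddr = newval;		/* st / scond               */

		*uval = existval;			/* hand back what was there */
		return 0;				/* -EFAULT only on a fault  */
	}
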
diff --git a/arch/arc/include/asm/mm-arch-hooks.h b/arch/arc/include/asm/mm-arch-hooks.h
deleted file mode 100644
index c37541c5f8ba..000000000000
--- a/arch/arc/include/asm/mm-arch-hooks.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Architecture specific mm hooks
- *
- * Copyright (C) 2015, IBM Corporation
- * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef _ASM_ARC_MM_ARCH_HOOKS_H
-#define _ASM_ARC_MM_ARCH_HOOKS_H
-
-#endif /* _ASM_ARC_MM_ARCH_HOOKS_H */
diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h
index 2b8880e953a2..5f071762fb1c 100644
--- a/arch/arc/include/asm/perf_event.h
+++ b/arch/arc/include/asm/perf_event.h
@@ -1,6 +1,7 @@
/*
* Linux performance counter support for ARC
*
+ * Copyright (C) 2014-2015 Synopsys, Inc. (www.synopsys.com)
* Copyright (C) 2011-2013 Synopsys, Inc. (www.synopsys.com)
*
* This program is free software; you can redistribute it and/or modify
@@ -12,8 +13,8 @@
#ifndef __ASM_PERF_EVENT_H
#define __ASM_PERF_EVENT_H
-/* real maximum varies per CPU, this is the maximum supported by the driver */
-#define ARC_PMU_MAX_HWEVENTS 64
+/* Max number of counters that PCT block may ever have */
+#define ARC_PERF_MAX_COUNTERS 32
#define ARC_REG_CC_BUILD 0xF6
#define ARC_REG_CC_INDEX 0x240
@@ -28,15 +29,22 @@
#define ARC_REG_PCT_CONFIG 0x254
#define ARC_REG_PCT_CONTROL 0x255
#define ARC_REG_PCT_INDEX 0x256
+#define ARC_REG_PCT_INT_CNTL 0x25C
+#define ARC_REG_PCT_INT_CNTH 0x25D
+#define ARC_REG_PCT_INT_CTRL 0x25E
+#define ARC_REG_PCT_INT_ACT 0x25F
+
+#define ARC_REG_PCT_CONFIG_USER (1 << 18) /* count in user mode */
+#define ARC_REG_PCT_CONFIG_KERN (1 << 19) /* count in kernel mode */
#define ARC_REG_PCT_CONTROL_CC (1 << 16) /* clear counts */
#define ARC_REG_PCT_CONTROL_SN (1 << 17) /* snapshot */
struct arc_reg_pct_build {
#ifdef CONFIG_CPU_BIG_ENDIAN
- unsigned int m:8, c:8, r:6, s:2, v:8;
+ unsigned int m:8, c:8, r:5, i:1, s:2, v:8;
#else
- unsigned int v:8, s:2, r:6, c:8, m:8;
+ unsigned int v:8, s:2, i:1, r:5, c:8, m:8;
#endif
};
@@ -95,10 +103,13 @@ static const char * const arc_pmu_ev_hw_map[] = {
/* counts condition */
[PERF_COUNT_HW_INSTRUCTIONS] = "iall",
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp",
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", /* Excludes ZOL jumps */
[PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */
+#ifdef CONFIG_ISA_ARCV2
+ [PERF_COUNT_HW_BRANCH_MISSES] = "bpmp",
+#else
[PERF_COUNT_HW_BRANCH_MISSES] = "bpfail", /* NP-T, PT-NT, PNT-T */
-
+#endif
[PERF_COUNT_ARC_LDC] = "imemrdc", /* Instr: mem read cached */
[PERF_COUNT_ARC_STC] = "imemwrc", /* Instr: mem write cached */
diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h
index 91755972b9a2..69095da1fcfd 100644
--- a/arch/arc/include/asm/ptrace.h
+++ b/arch/arc/include/asm/ptrace.h
@@ -20,20 +20,20 @@
struct pt_regs {
/* Real registers */
- long bta; /* bta_l1, bta_l2, erbta */
+ unsigned long bta; /* bta_l1, bta_l2, erbta */
- long lp_start, lp_end, lp_count;
+ unsigned long lp_start, lp_end, lp_count;
- long status32; /* status32_l1, status32_l2, erstatus */
- long ret; /* ilink1, ilink2 or eret */
- long blink;
- long fp;
- long r26; /* gp */
+ unsigned long status32; /* status32_l1, status32_l2, erstatus */
+ unsigned long ret; /* ilink1, ilink2 or eret */
+ unsigned long blink;
+ unsigned long fp;
+ unsigned long r26; /* gp */
- long r12, r11, r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0;
+ unsigned long r12, r11, r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0;
- long sp; /* user/kernel sp depending on where we came from */
- long orig_r0;
+ unsigned long sp; /* User/Kernel depending on where we came from */
+ unsigned long orig_r0;
/*
* To distinguish bet excp, syscall, irq
@@ -55,13 +55,13 @@ struct pt_regs {
unsigned long event;
};
- long user_r25;
+ unsigned long user_r25;
};
#else
struct pt_regs {
- long orig_r0;
+ unsigned long orig_r0;
union {
struct {
@@ -76,26 +76,26 @@ struct pt_regs {
unsigned long event;
};
- long bta; /* bta_l1, bta_l2, erbta */
+ unsigned long bta; /* bta_l1, bta_l2, erbta */
- long user_r25;
+ unsigned long user_r25;
- long r26; /* gp */
- long fp;
- long sp; /* user/kernel sp depending on where we came from */
+ unsigned long r26; /* gp */
+ unsigned long fp;
+ unsigned long sp; /* user/kernel sp depending on where we came from */
- long r12;
+ unsigned long r12;
/*------- Below list auto saved by h/w -----------*/
- long r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
+ unsigned long r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
- long blink;
- long lp_end, lp_start, lp_count;
+ unsigned long blink;
+ unsigned long lp_end, lp_start, lp_count;
- long ei, ldi, jli;
+ unsigned long ei, ldi, jli;
- long ret;
- long status32;
+ unsigned long ret;
+ unsigned long status32;
};
#endif
@@ -103,7 +103,7 @@ struct pt_regs {
/* Callee saved registers - need to be saved only when you are scheduled out */
struct callee_regs {
- long r25, r24, r23, r22, r21, r20, r19, r18, r17, r16, r15, r14, r13;
+ unsigned long r25, r24, r23, r22, r21, r20, r19, r18, r17, r16, r15, r14, r13;
};
#define instruction_pointer(regs) ((regs)->ret)
@@ -142,7 +142,7 @@ struct callee_regs {
static inline long regs_return_value(struct pt_regs *regs)
{
- return regs->r0;
+ return (long)regs->r0;
}
#endif /* !__ASSEMBLY__ */
diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h
index e1651df6a93d..db8c59d1eaeb 100644
--- a/arch/arc/include/asm/spinlock.h
+++ b/arch/arc/include/asm/spinlock.h
@@ -18,9 +18,518 @@
#define arch_spin_unlock_wait(x) \
do { while (arch_spin_is_locked(x)) cpu_relax(); } while (0)
+#ifdef CONFIG_ARC_HAS_LLSC
+
+/*
+ * A normal LLOCK/SCOND based system, w/o need for livelock workaround
+ */
+#ifndef CONFIG_ARC_STAR_9000923308
+
static inline void arch_spin_lock(arch_spinlock_t *lock)
{
- unsigned int tmp = __ARCH_SPIN_LOCK_LOCKED__;
+ unsigned int val;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "1: llock %[val], [%[slock]] \n"
+ " breq %[val], %[LOCKED], 1b \n" /* spin while LOCKED */
+ " scond %[LOCKED], [%[slock]] \n" /* acquire */
+ " bnz 1b \n"
+ " \n"
+ : [val] "=&r" (val)
+ : [slock] "r" (&(lock->slock)),
+ [LOCKED] "r" (__ARCH_SPIN_LOCK_LOCKED__)
+ : "memory", "cc");
+
+ smp_mb();
+}
+
+/* 1 - lock taken successfully */
+static inline int arch_spin_trylock(arch_spinlock_t *lock)
+{
+ unsigned int val, got_it = 0;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "1: llock %[val], [%[slock]] \n"
+ " breq %[val], %[LOCKED], 4f \n" /* already LOCKED, just bail */
+ " scond %[LOCKED], [%[slock]] \n" /* acquire */
+ " bnz 1b \n"
+ " mov %[got_it], 1 \n"
+ "4: \n"
+ " \n"
+ : [val] "=&r" (val),
+ [got_it] "+&r" (got_it)
+ : [slock] "r" (&(lock->slock)),
+ [LOCKED] "r" (__ARCH_SPIN_LOCK_LOCKED__)
+ : "memory", "cc");
+
+ smp_mb();
+
+ return got_it;
+}
+
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+ smp_mb();
+
+ lock->slock = __ARCH_SPIN_LOCK_UNLOCKED__;
+
+ smp_mb();
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers but only one writer.
+ * Unfair locking as Writers could be starved indefinitely by Reader(s)
+ */
+
+static inline void arch_read_lock(arch_rwlock_t *rw)
+{
+ unsigned int val;
+
+ smp_mb();
+
+ /*
+ * zero means writer holds the lock exclusively, deny Reader.
+ * Otherwise grant lock to first/subseq reader
+ *
+ * if (rw->counter > 0) {
+ * rw->counter--;
+ * ret = 1;
+ * }
+ */
+
+ __asm__ __volatile__(
+ "1: llock %[val], [%[rwlock]] \n"
+ " brls %[val], %[WR_LOCKED], 1b\n" /* <= 0: spin while write locked */
+ " sub %[val], %[val], 1 \n" /* reader lock */
+ " scond %[val], [%[rwlock]] \n"
+ " bnz 1b \n"
+ " \n"
+ : [val] "=&r" (val)
+ : [rwlock] "r" (&(rw->counter)),
+ [WR_LOCKED] "ir" (0)
+ : "memory", "cc");
+
+ smp_mb();
+}
+
+/* 1 - lock taken successfully */
+static inline int arch_read_trylock(arch_rwlock_t *rw)
+{
+ unsigned int val, got_it = 0;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "1: llock %[val], [%[rwlock]] \n"
+ " brls %[val], %[WR_LOCKED], 4f\n" /* <= 0: already write locked, bail */
+ " sub %[val], %[val], 1 \n" /* counter-- */
+ " scond %[val], [%[rwlock]] \n"
+ " bnz 1b \n" /* retry if collided with someone */
+ " mov %[got_it], 1 \n"
+ " \n"
+ "4: ; --- done --- \n"
+
+ : [val] "=&r" (val),
+ [got_it] "+&r" (got_it)
+ : [rwlock] "r" (&(rw->counter)),
+ [WR_LOCKED] "ir" (0)
+ : "memory", "cc");
+
+ smp_mb();
+
+ return got_it;
+}
+
+static inline void arch_write_lock(arch_rwlock_t *rw)
+{
+ unsigned int val;
+
+ smp_mb();
+
+ /*
+ * If reader(s) hold lock (lock < __ARCH_RW_LOCK_UNLOCKED__),
+ * deny writer. Otherwise if unlocked grant to writer
+ * Hence the claim that Linux rwlocks are unfair to writers.
+ * (can be starved for an indefinite time by readers).
+ *
+ * if (rw->counter == __ARCH_RW_LOCK_UNLOCKED__) {
+ * rw->counter = 0;
+ * ret = 1;
+ * }
+ */
+
+ __asm__ __volatile__(
+ "1: llock %[val], [%[rwlock]] \n"
+ " brne %[val], %[UNLOCKED], 1b \n" /* while !UNLOCKED spin */
+ " mov %[val], %[WR_LOCKED] \n"
+ " scond %[val], [%[rwlock]] \n"
+ " bnz 1b \n"
+ " \n"
+ : [val] "=&r" (val)
+ : [rwlock] "r" (&(rw->counter)),
+ [UNLOCKED] "ir" (__ARCH_RW_LOCK_UNLOCKED__),
+ [WR_LOCKED] "ir" (0)
+ : "memory", "cc");
+
+ smp_mb();
+}
+
+/* 1 - lock taken successfully */
+static inline int arch_write_trylock(arch_rwlock_t *rw)
+{
+ unsigned int val, got_it = 0;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "1: llock %[val], [%[rwlock]] \n"
+ " brne %[val], %[UNLOCKED], 4f \n" /* !UNLOCKED, bail */
+ " mov %[val], %[WR_LOCKED] \n"
+ " scond %[val], [%[rwlock]] \n"
+ " bnz 1b \n" /* retry if collided with someone */
+ " mov %[got_it], 1 \n"
+ " \n"
+ "4: ; --- done --- \n"
+
+ : [val] "=&r" (val),
+ [got_it] "+&r" (got_it)
+ : [rwlock] "r" (&(rw->counter)),
+ [UNLOCKED] "ir" (__ARCH_RW_LOCK_UNLOCKED__),
+ [WR_LOCKED] "ir" (0)
+ : "memory", "cc");
+
+ smp_mb();
+
+ return got_it;
+}
+
+static inline void arch_read_unlock(arch_rwlock_t *rw)
+{
+ unsigned int val;
+
+ smp_mb();
+
+ /*
+ * rw->counter++;
+ */
+ __asm__ __volatile__(
+ "1: llock %[val], [%[rwlock]] \n"
+ " add %[val], %[val], 1 \n"
+ " scond %[val], [%[rwlock]] \n"
+ " bnz 1b \n"
+ " \n"
+ : [val] "=&r" (val)
+ : [rwlock] "r" (&(rw->counter))
+ : "memory", "cc");
+
+ smp_mb();
+}
+
+static inline void arch_write_unlock(arch_rwlock_t *rw)
+{
+ smp_mb();
+
+ rw->counter = __ARCH_RW_LOCK_UNLOCKED__;
+
+ smp_mb();
+}
+
+#else /* CONFIG_ARC_STAR_9000923308 */
+
+/*
+ * HS38x4 could get into a LLOCK/SCOND livelock in case of multiple overlapping
+ * coherency transactions in the SCU. The exclusive line state keeps rotating
+ * among contending cores, leading to a never ending cycle. So break the cycle
+ * by deferring the retry of a failed exclusive access (SCOND). The actual delay
+ * needed is a function of the number of contending cores as well as the unrelated
+ * coherency traffic from other cores. To keep the code simple, start off with a
+ * small delay of 1 which would suffice in most cases, and in case of contention
+ * double the delay. Eventually the delay is sufficient such that the coherency
+ * pipeline is drained, thus a subsequent exclusive access would succeed.
+ */
+
+#define SCOND_FAIL_RETRY_VAR_DEF \
+ unsigned int delay, tmp; \
+
+#define SCOND_FAIL_RETRY_ASM \
+ " ; --- scond fail delay --- \n" \
+ " mov %[tmp], %[delay] \n" /* tmp = delay */ \
+ "2: brne.d %[tmp], 0, 2b \n" /* while (tmp != 0) */ \
+ " sub %[tmp], %[tmp], 1 \n" /* tmp-- */ \
+ " rol %[delay], %[delay] \n" /* delay *= 2 */ \
+ " b 1b \n" /* start over */ \
+ " \n" \
+ "4: ; --- done --- \n" \
+
+#define SCOND_FAIL_RETRY_VARS \
+ ,[delay] "=&r" (delay), [tmp] "=&r" (tmp) \
+
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+ unsigned int val;
+ SCOND_FAIL_RETRY_VAR_DEF;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "0: mov %[delay], 1 \n"
+ "1: llock %[val], [%[slock]] \n"
+ " breq %[val], %[LOCKED], 0b \n" /* spin while LOCKED */
+ " scond %[LOCKED], [%[slock]] \n" /* acquire */
+ " bz 4f \n" /* done */
+ " \n"
+ SCOND_FAIL_RETRY_ASM
+
+ : [val] "=&r" (val)
+ SCOND_FAIL_RETRY_VARS
+ : [slock] "r" (&(lock->slock)),
+ [LOCKED] "r" (__ARCH_SPIN_LOCK_LOCKED__)
+ : "memory", "cc");
+
+ smp_mb();
+}
+
+/* 1 - lock taken successfully */
+static inline int arch_spin_trylock(arch_spinlock_t *lock)
+{
+ unsigned int val, got_it = 0;
+ SCOND_FAIL_RETRY_VAR_DEF;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "0: mov %[delay], 1 \n"
+ "1: llock %[val], [%[slock]] \n"
+ " breq %[val], %[LOCKED], 4f \n" /* already LOCKED, just bail */
+ " scond %[LOCKED], [%[slock]] \n" /* acquire */
+ " bz.d 4f \n"
+ " mov.z %[got_it], 1 \n" /* got it */
+ " \n"
+ SCOND_FAIL_RETRY_ASM
+
+ : [val] "=&r" (val),
+ [got_it] "+&r" (got_it)
+ SCOND_FAIL_RETRY_VARS
+ : [slock] "r" (&(lock->slock)),
+ [LOCKED] "r" (__ARCH_SPIN_LOCK_LOCKED__)
+ : "memory", "cc");
+
+ smp_mb();
+
+ return got_it;
+}
+
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+ smp_mb();
+
+ lock->slock = __ARCH_SPIN_LOCK_UNLOCKED__;
+
+ smp_mb();
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers but only one writer.
+ * Unfair locking as Writers could be starved indefinitely by Reader(s)
+ */
+
+static inline void arch_read_lock(arch_rwlock_t *rw)
+{
+ unsigned int val;
+ SCOND_FAIL_RETRY_VAR_DEF;
+
+ smp_mb();
+
+ /*
+ * zero means writer holds the lock exclusively, deny Reader.
+ * Otherwise grant lock to first/subseq reader
+ *
+ * if (rw->counter > 0) {
+ * rw->counter--;
+ * ret = 1;
+ * }
+ */
+
+ __asm__ __volatile__(
+ "0: mov %[delay], 1 \n"
+ "1: llock %[val], [%[rwlock]] \n"
+ " brls %[val], %[WR_LOCKED], 0b\n" /* <= 0: spin while write locked */
+ " sub %[val], %[val], 1 \n" /* reader lock */
+ " scond %[val], [%[rwlock]] \n"
+ " bz 4f \n" /* done */
+ " \n"
+ SCOND_FAIL_RETRY_ASM
+
+ : [val] "=&r" (val)
+ SCOND_FAIL_RETRY_VARS
+ : [rwlock] "r" (&(rw->counter)),
+ [WR_LOCKED] "ir" (0)
+ : "memory", "cc");
+
+ smp_mb();
+}
+
+/* 1 - lock taken successfully */
+static inline int arch_read_trylock(arch_rwlock_t *rw)
+{
+ unsigned int val, got_it = 0;
+ SCOND_FAIL_RETRY_VAR_DEF;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "0: mov %[delay], 1 \n"
+ "1: llock %[val], [%[rwlock]] \n"
+ " brls %[val], %[WR_LOCKED], 4f\n" /* <= 0: already write locked, bail */
+ " sub %[val], %[val], 1 \n" /* counter-- */
+ " scond %[val], [%[rwlock]] \n"
+ " bz.d 4f \n"
+ " mov.z %[got_it], 1 \n" /* got it */
+ " \n"
+ SCOND_FAIL_RETRY_ASM
+
+ : [val] "=&r" (val),
+ [got_it] "+&r" (got_it)
+ SCOND_FAIL_RETRY_VARS
+ : [rwlock] "r" (&(rw->counter)),
+ [WR_LOCKED] "ir" (0)
+ : "memory", "cc");
+
+ smp_mb();
+
+ return got_it;
+}
+
+static inline void arch_write_lock(arch_rwlock_t *rw)
+{
+ unsigned int val;
+ SCOND_FAIL_RETRY_VAR_DEF;
+
+ smp_mb();
+
+ /*
+ * If reader(s) hold lock (lock < __ARCH_RW_LOCK_UNLOCKED__),
+ * deny writer. Otherwise if unlocked grant to writer
+ * Hence the claim that Linux rwlocks are unfair to writers.
+ * (can be starved for an indefinite time by readers).
+ *
+ * if (rw->counter == __ARCH_RW_LOCK_UNLOCKED__) {
+ * rw->counter = 0;
+ * ret = 1;
+ * }
+ */
+
+ __asm__ __volatile__(
+ "0: mov %[delay], 1 \n"
+ "1: llock %[val], [%[rwlock]] \n"
+ " brne %[val], %[UNLOCKED], 0b \n" /* while !UNLOCKED spin */
+ " mov %[val], %[WR_LOCKED] \n"
+ " scond %[val], [%[rwlock]] \n"
+ " bz 4f \n"
+ " \n"
+ SCOND_FAIL_RETRY_ASM
+
+ : [val] "=&r" (val)
+ SCOND_FAIL_RETRY_VARS
+ : [rwlock] "r" (&(rw->counter)),
+ [UNLOCKED] "ir" (__ARCH_RW_LOCK_UNLOCKED__),
+ [WR_LOCKED] "ir" (0)
+ : "memory", "cc");
+
+ smp_mb();
+}
+
+/* 1 - lock taken successfully */
+static inline int arch_write_trylock(arch_rwlock_t *rw)
+{
+ unsigned int val, got_it = 0;
+ SCOND_FAIL_RETRY_VAR_DEF;
+
+ smp_mb();
+
+ __asm__ __volatile__(
+ "0: mov %[delay], 1 \n"
+ "1: llock %[val], [%[rwlock]] \n"
+ " brne %[val], %[UNLOCKED], 4f \n" /* !UNLOCKED, bail */
+ " mov %[val], %[WR_LOCKED] \n"
+ " scond %[val], [%[rwlock]] \n"
+ " bz.d 4f \n"
+ " mov.z %[got_it], 1 \n" /* got it */
+ " \n"
+ SCOND_FAIL_RETRY_ASM
+
+ : [val] "=&r" (val),
+ [got_it] "+&r" (got_it)
+ SCOND_FAIL_RETRY_VARS
+ : [rwlock] "r" (&(rw->counter)),
+ [UNLOCKED] "ir" (__ARCH_RW_LOCK_UNLOCKED__),
+ [WR_LOCKED] "ir" (0)
+ : "memory", "cc");
+
+ smp_mb();
+
+ return got_it;
+}
+
+static inline void arch_read_unlock(arch_rwlock_t *rw)
+{
+ unsigned int val;
+
+ smp_mb();
+
+ /*
+ * rw->counter++;
+ */
+ __asm__ __volatile__(
+ "1: llock %[val], [%[rwlock]] \n"
+ " add %[val], %[val], 1 \n"
+ " scond %[val], [%[rwlock]] \n"
+ " bnz 1b \n"
+ " \n"
+ : [val] "=&r" (val)
+ : [rwlock] "r" (&(rw->counter))
+ : "memory", "cc");
+
+ smp_mb();
+}
+
+static inline void arch_write_unlock(arch_rwlock_t *rw)
+{
+ unsigned int val;
+
+ smp_mb();
+
+ /*
+ * rw->counter = __ARCH_RW_LOCK_UNLOCKED__;
+ */
+ __asm__ __volatile__(
+ "1: llock %[val], [%[rwlock]] \n"
+ " scond %[UNLOCKED], [%[rwlock]]\n"
+ " bnz 1b \n"
+ " \n"
+ : [val] "=&r" (val)
+ : [rwlock] "r" (&(rw->counter)),
+ [UNLOCKED] "r" (__ARCH_RW_LOCK_UNLOCKED__)
+ : "memory", "cc");
+
+ smp_mb();
+}
+
+#undef SCOND_FAIL_RETRY_VAR_DEF
+#undef SCOND_FAIL_RETRY_ASM
+#undef SCOND_FAIL_RETRY_VARS
+
+#endif /* CONFIG_ARC_STAR_9000923308 */
+
+#else /* !CONFIG_ARC_HAS_LLSC */
+
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+ unsigned int val = __ARCH_SPIN_LOCK_LOCKED__;
/*
* This smp_mb() is technically superfluous, we only need the one
@@ -33,7 +542,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
__asm__ __volatile__(
"1: ex %0, [%1] \n"
" breq %0, %2, 1b \n"
- : "+&r" (tmp)
+ : "+&r" (val)
: "r"(&(lock->slock)), "ir"(__ARCH_SPIN_LOCK_LOCKED__)
: "memory");
@@ -48,26 +557,27 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
smp_mb();
}
+/* 1 - lock taken successfully */
static inline int arch_spin_trylock(arch_spinlock_t *lock)
{
- unsigned int tmp = __ARCH_SPIN_LOCK_LOCKED__;
+ unsigned int val = __ARCH_SPIN_LOCK_LOCKED__;
smp_mb();
__asm__ __volatile__(
"1: ex %0, [%1] \n"
- : "+r" (tmp)
+ : "+r" (val)
: "r"(&(lock->slock))
: "memory");
smp_mb();
- return (tmp == __ARCH_SPIN_LOCK_UNLOCKED__);
+ return (val == __ARCH_SPIN_LOCK_UNLOCKED__);
}
static inline void arch_spin_unlock(arch_spinlock_t *lock)
{
- unsigned int tmp = __ARCH_SPIN_LOCK_UNLOCKED__;
+ unsigned int val = __ARCH_SPIN_LOCK_UNLOCKED__;
/*
* RELEASE barrier: given the instructions avail on ARCv2, full barrier
@@ -77,7 +587,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
__asm__ __volatile__(
" ex %0, [%1] \n"
- : "+r" (tmp)
+ : "+r" (val)
: "r"(&(lock->slock))
: "memory");
@@ -90,19 +600,12 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
/*
* Read-write spinlocks, allowing multiple readers but only one writer.
+ * Unfair locking as Writers could be starved indefinitely by Reader(s)
*
* The spinlock itself is contained in @counter and access to it is
* serialized with @lock_mutex.
- *
- * Unfair locking as Writers could be starved indefinitely by Reader(s)
*/
-/* Would read_trylock() succeed? */
-#define arch_read_can_lock(x) ((x)->counter > 0)
-
-/* Would write_trylock() succeed? */
-#define arch_write_can_lock(x) ((x)->counter == __ARCH_RW_LOCK_UNLOCKED__)
-
/* 1 - lock taken successfully */
static inline int arch_read_trylock(arch_rwlock_t *rw)
{
@@ -173,6 +676,11 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
arch_spin_unlock(&(rw->lock_mutex));
}
+#endif
+
+#define arch_read_can_lock(x) ((x)->counter > 0)
+#define arch_write_can_lock(x) ((x)->counter == __ARCH_RW_LOCK_UNLOCKED__)
+
#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
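
A user-space C11 sketch of the SCOND_FAIL_RETRY exponential backoff introduced above for the STAR 9000923308 livelock; the lock values and the compare-and-swap standing in for LLOCK/SCOND are assumptions for illustration, the real implementation is the inline asm in the hunk:

	#include <stdatomic.h>

	#define SLOCK_UNLOCKED	0u
	#define SLOCK_LOCKED	1u

	/*
	 * Model of arch_spin_lock() with the workaround: spin while the lock is
	 * held, attempt the exclusive store, and on failure busy-wait for 'delay'
	 * iterations, doubling the delay each time so contending cores back off
	 * until one store-conditional finally succeeds.
	 */
	static void spin_lock_model(_Atomic unsigned int *slock)
	{
		unsigned int delay = 1, expected;

		for (;;) {
			while (atomic_load(slock) == SLOCK_LOCKED)	/* llock + breq */
				;
			expected = SLOCK_UNLOCKED;
			if (atomic_compare_exchange_weak(slock, &expected, SLOCK_LOCKED))
				return;					/* scond succeeded */
			for (volatile unsigned int t = delay; t; t--)	/* scond failed: back off */
				;
			delay <<= 1;					/* rol: delay *= 2 */
		}
	}
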
diff --git a/arch/arc/include/asm/spinlock_types.h b/arch/arc/include/asm/spinlock_types.h
index 662627ced4f2..4e1ef5f650c6 100644
--- a/arch/arc/include/asm/spinlock_types.h
+++ b/arch/arc/include/asm/spinlock_types.h
@@ -26,7 +26,9 @@ typedef struct {
*/
typedef struct {
volatile unsigned int counter;
+#ifndef CONFIG_ARC_HAS_LLSC
arch_spinlock_t lock_mutex;
+#endif
} arch_rwlock_t;
#define __ARCH_RW_LOCK_UNLOCKED__ 0x01000000
diff --git a/arch/arc/include/uapi/asm/ptrace.h b/arch/arc/include/uapi/asm/ptrace.h
index 76a7739aab1c..0b3ef63d4a03 100644
--- a/arch/arc/include/uapi/asm/ptrace.h
+++ b/arch/arc/include/uapi/asm/ptrace.h
@@ -32,20 +32,20 @@
*/
struct user_regs_struct {
- long pad;
+ unsigned long pad;
struct {
- long bta, lp_start, lp_end, lp_count;
- long status32, ret, blink, fp, gp;
- long r12, r11, r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0;
- long sp;
+ unsigned long bta, lp_start, lp_end, lp_count;
+ unsigned long status32, ret, blink, fp, gp;
+ unsigned long r12, r11, r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0;
+ unsigned long sp;
} scratch;
- long pad2;
+ unsigned long pad2;
struct {
- long r25, r24, r23, r22, r21, r20;
- long r19, r18, r17, r16, r15, r14, r13;
+ unsigned long r25, r24, r23, r22, r21, r20;
+ unsigned long r19, r18, r17, r16, r15, r14, r13;
} callee;
- long efa; /* break pt addr, for break points in delay slots */
- long stop_pc; /* give dbg stop_pc after ensuring brkpt trap */
+ unsigned long efa; /* break pt addr, for break points in delay slots */
+ unsigned long stop_pc; /* give dbg stop_pc after ensuring brkpt trap */
};
#endif /* !__ASSEMBLY__ */
diff --git a/arch/arc/kernel/entry-arcv2.S b/arch/arc/kernel/entry-arcv2.S
index bd7105d3172f..8fa76567e402 100644
--- a/arch/arc/kernel/entry-arcv2.S
+++ b/arch/arc/kernel/entry-arcv2.S
@@ -57,13 +57,8 @@ VECTOR handle_interrupt ; (23) End of fixed IRQs
.section .text, "ax",@progbits
-res_service: ; processor restart
- flag 0x1 ; not implemented
- nop
- nop
-
-reserved: ; processor restart
- rtie ; jump to processor initializations
+reserved:
+ flag 1 ; Unexpected event, halt
;##################### Interrupt Handling ##############################
diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S
index f7a82fd4d601..589abf5172d6 100644
--- a/arch/arc/kernel/entry.S
+++ b/arch/arc/kernel/entry.S
@@ -42,7 +42,7 @@ ENTRY(ret_from_fork)
; when the forked child comes here from the __switch_to function
; r0 has the last task pointer.
; put last task in scheduler queue
- bl @schedule_tail
+ jl @schedule_tail
ld r9, [sp, PT_status32]
brne r9, 0, 1f
@@ -320,7 +320,7 @@ resume_user_mode_begin:
; --- (Slow Path #1) task preemption ---
bbit0 r9, TIF_NEED_RESCHED, .Lchk_pend_signals
mov blink, resume_user_mode_begin ; tail-call to U mode ret chks
- b @schedule ; BTST+Bnz causes relo error in link
+ j @schedule ; BTST+Bnz causes relo error in link
.Lchk_pend_signals:
IRQ_ENABLE r10
@@ -381,7 +381,7 @@ resume_kernel_mode:
bbit0 r9, TIF_NEED_RESCHED, .Lrestore_regs
; Invoke PREEMPTION
- bl preempt_schedule_irq
+ jl preempt_schedule_irq
; preempt_schedule_irq() always returns with IRQ disabled
#endif
diff --git a/arch/arc/kernel/intc-arcv2.c b/arch/arc/kernel/intc-arcv2.c
index 6208c630abed..26c156827479 100644
--- a/arch/arc/kernel/intc-arcv2.c
+++ b/arch/arc/kernel/intc-arcv2.c
@@ -12,7 +12,6 @@
#include <linux/of.h>
#include <linux/irqdomain.h>
#include <linux/irqchip.h>
-#include "../../drivers/irqchip/irqchip.h"
#include <asm/irq.h>
/*
diff --git a/arch/arc/kernel/intc-compact.c b/arch/arc/kernel/intc-compact.c
index fcdddb631766..039fac30b5c1 100644
--- a/arch/arc/kernel/intc-compact.c
+++ b/arch/arc/kernel/intc-compact.c
@@ -12,7 +12,6 @@
#include <linux/of.h>
#include <linux/irqdomain.h>
#include <linux/irqchip.h>
-#include "../../drivers/irqchip/irqchip.h"
#include <asm/irq.h>
/*
diff --git a/arch/arc/kernel/mcip.c b/arch/arc/kernel/mcip.c
index 30284e8de6ff..d9e44b62df05 100644
--- a/arch/arc/kernel/mcip.c
+++ b/arch/arc/kernel/mcip.c
@@ -175,7 +175,6 @@ void mcip_init_early_smp(void)
#include <linux/irqchip.h>
#include <linux/of.h>
#include <linux/of_irq.h>
-#include "../../drivers/irqchip/irqchip.h"
/*
* Set the DEST for @cmn_irq to @cpu_mask (1 bit per core)
@@ -218,11 +217,28 @@ static void idu_irq_unmask(struct irq_data *data)
raw_spin_unlock_irqrestore(&mcip_lock, flags);
}
+#ifdef CONFIG_SMP
static int
-idu_irq_set_affinity(struct irq_data *d, const struct cpumask *cpumask, bool f)
+idu_irq_set_affinity(struct irq_data *data, const struct cpumask *cpumask,
+ bool force)
{
+ unsigned long flags;
+ cpumask_t online;
+
+	/* error out if no online cpu in @cpumask */
+ if (!cpumask_and(&online, cpumask, cpu_online_mask))
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&mcip_lock, flags);
+
+ idu_set_dest(data->hwirq, cpumask_bits(&online)[0]);
+ idu_set_mode(data->hwirq, IDU_M_TRIG_LEVEL, IDU_M_DISTRI_RR);
+
+ raw_spin_unlock_irqrestore(&mcip_lock, flags);
+
return IRQ_SET_MASK_OK;
}
+#endif
static struct irq_chip idu_irq_chip = {
.name = "MCIP IDU Intc",
@@ -236,9 +252,10 @@ static struct irq_chip idu_irq_chip = {
static int idu_first_irq;
-static void idu_cascade_isr(unsigned int core_irq, struct irq_desc *desc)
+static void idu_cascade_isr(unsigned int __core_irq, struct irq_desc *desc)
{
struct irq_domain *domain = irq_desc_get_handler_data(desc);
+ unsigned int core_irq = irq_desc_get_irq(desc);
unsigned int idu_irq;
idu_irq = core_irq - idu_first_irq;
@@ -330,8 +347,7 @@ idu_of_init(struct device_node *intc, struct device_node *parent)
if (!i)
idu_first_irq = irq;
- irq_set_handler_data(irq, domain);
- irq_set_chained_handler(irq, idu_cascade_isr);
+ irq_set_chained_handler_and_data(irq, idu_cascade_isr, domain);
}
__mcip_cmd(CMD_IDU_ENABLE, 0);
diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c
index 1287388c258a..0c08bb1ce15a 100644
--- a/arch/arc/kernel/perf_event.c
+++ b/arch/arc/kernel/perf_event.c
@@ -1,7 +1,7 @@
/*
* Linux performance counter support for ARC700 series
*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013-2015 Synopsys, Inc. (www.synopsys.com)
*
* This code is inspired by the perf support of various other architectures.
*
@@ -11,6 +11,7 @@
*
*/
#include <linux/errno.h>
+#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/perf_event.h>
@@ -20,12 +21,25 @@
struct arc_pmu {
struct pmu pmu;
- int counter_size; /* in bits */
+ unsigned int irq;
int n_counters;
- unsigned long used_mask[BITS_TO_LONGS(ARC_PMU_MAX_HWEVENTS)];
+ u64 max_period;
int ev_hw_idx[PERF_COUNT_ARC_HW_MAX];
};
+struct arc_pmu_cpu {
+ /*
+ * A 1 bit for an index indicates that the counter is being used for
+ * an event. A 0 means that the counter can be used.
+ */
+ unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)];
+
+ /*
+ * The events that are active on the PMU for the given index.
+ */
+ struct perf_event *act_counter[ARC_PERF_MAX_COUNTERS];
+};
+
struct arc_callchain_trace {
int depth;
void *perf_stuff;
@@ -65,6 +79,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
}
static struct arc_pmu *arc_pmu;
+static DEFINE_PER_CPU(struct arc_pmu_cpu, arc_pmu_cpu);
/* read counter #idx; note that counter# != event# on ARC! */
static uint64_t arc_pmu_read_counter(int idx)
@@ -88,18 +103,15 @@ static uint64_t arc_pmu_read_counter(int idx)
static void arc_perf_event_update(struct perf_event *event,
struct hw_perf_event *hwc, int idx)
{
- uint64_t prev_raw_count, new_raw_count;
- int64_t delta;
-
- do {
- prev_raw_count = local64_read(&hwc->prev_count);
- new_raw_count = arc_pmu_read_counter(idx);
- } while (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
- new_raw_count) != prev_raw_count);
-
- delta = (new_raw_count - prev_raw_count) &
- ((1ULL << arc_pmu->counter_size) - 1ULL);
+ uint64_t prev_raw_count = local64_read(&hwc->prev_count);
+ uint64_t new_raw_count = arc_pmu_read_counter(idx);
+ int64_t delta = new_raw_count - prev_raw_count;
+ /*
+	 * We aren't afraid of hwc->prev_count changing beneath our feet
+ * because there's no way for us to re-enter this function anytime.
+ */
+ local64_set(&hwc->prev_count, new_raw_count);
local64_add(delta, &event->count);
local64_sub(delta, &hwc->period_left);
}
@@ -142,22 +154,41 @@ static int arc_pmu_event_init(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
int ret;
+ if (!is_sampling_event(event)) {
+ hwc->sample_period = arc_pmu->max_period;
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ }
+
+ hwc->config = 0;
+
+ if (is_isa_arcv2()) {
+ /* "exclude user" means "count only kernel" */
+ if (event->attr.exclude_user)
+ hwc->config |= ARC_REG_PCT_CONFIG_KERN;
+
+ /* "exclude kernel" means "count only user" */
+ if (event->attr.exclude_kernel)
+ hwc->config |= ARC_REG_PCT_CONFIG_USER;
+ }
+
switch (event->attr.type) {
case PERF_TYPE_HARDWARE:
if (event->attr.config >= PERF_COUNT_HW_MAX)
return -ENOENT;
if (arc_pmu->ev_hw_idx[event->attr.config] < 0)
return -ENOENT;
- hwc->config = arc_pmu->ev_hw_idx[event->attr.config];
+ hwc->config |= arc_pmu->ev_hw_idx[event->attr.config];
pr_debug("init event %d with h/w %d \'%s\'\n",
(int) event->attr.config, (int) hwc->config,
arc_pmu_ev_hw_map[event->attr.config]);
return 0;
+
case PERF_TYPE_HW_CACHE:
ret = arc_pmu_cache_event(event->attr.config);
if (ret < 0)
return ret;
- hwc->config = arc_pmu->ev_hw_idx[ret];
+ hwc->config |= arc_pmu->ev_hw_idx[ret];
return 0;
default:
return -ENOENT;
@@ -180,6 +211,47 @@ static void arc_pmu_disable(struct pmu *pmu)
write_aux_reg(ARC_REG_PCT_CONTROL, (tmp & 0xffff0000) | 0x0);
}
+static int arc_pmu_event_set_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int idx = hwc->idx;
+ int overflow = 0;
+ u64 value;
+
+ if (unlikely(left <= -period)) {
+ /* left underflowed by more than period. */
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ } else if (unlikely(left <= 0)) {
+ /* left underflowed by less than period. */
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ if (left > arc_pmu->max_period)
+ left = arc_pmu->max_period;
+
+ value = arc_pmu->max_period - left;
+ local64_set(&hwc->prev_count, value);
+
+ /* Select counter */
+ write_aux_reg(ARC_REG_PCT_INDEX, idx);
+
+ /* Write value */
+ write_aux_reg(ARC_REG_PCT_COUNTL, (u32)value);
+ write_aux_reg(ARC_REG_PCT_COUNTH, (value >> 32));
+
+ perf_event_update_userpage(event);
+
+ return overflow;
+}
+
/*
* Assigns hardware counter to hardware condition.
* Note that there is no separate start/stop mechanism;
@@ -194,13 +266,20 @@ static void arc_pmu_start(struct perf_event *event, int flags)
return;
if (flags & PERF_EF_RELOAD)
- WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+
+ hwc->state = 0;
- event->hw.state = 0;
+ arc_pmu_event_set_period(event);
+
+ /* Enable interrupt for this counter */
+ if (is_sampling_event(event))
+ write_aux_reg(ARC_REG_PCT_INT_CTRL,
+ read_aux_reg(ARC_REG_PCT_INT_CTRL) | (1 << idx));
/* enable ARC pmu here */
- write_aux_reg(ARC_REG_PCT_INDEX, idx);
- write_aux_reg(ARC_REG_PCT_CONFIG, hwc->config);
+ write_aux_reg(ARC_REG_PCT_INDEX, idx); /* counter # */
+ write_aux_reg(ARC_REG_PCT_CONFIG, hwc->config); /* condition */
}
static void arc_pmu_stop(struct perf_event *event, int flags)
@@ -208,6 +287,17 @@ static void arc_pmu_stop(struct perf_event *event, int flags)
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
+ /* Disable interrupt for this counter */
+ if (is_sampling_event(event)) {
+ /*
+ * Reset interrupt flag by writing of 1. This is required
+ * to make sure pending interrupt was not left.
+ */
+ write_aux_reg(ARC_REG_PCT_INT_ACT, 1 << idx);
+ write_aux_reg(ARC_REG_PCT_INT_CTRL,
+ read_aux_reg(ARC_REG_PCT_INT_CTRL) & ~(1 << idx));
+ }
+
if (!(event->hw.state & PERF_HES_STOPPED)) {
/* stop ARC pmu here */
write_aux_reg(ARC_REG_PCT_INDEX, idx);
@@ -227,8 +317,12 @@ static void arc_pmu_stop(struct perf_event *event, int flags)
static void arc_pmu_del(struct perf_event *event, int flags)
{
+ struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu);
+
arc_pmu_stop(event, PERF_EF_UPDATE);
- __clear_bit(event->hw.idx, arc_pmu->used_mask);
+ __clear_bit(event->hw.idx, pmu_cpu->used_mask);
+
+ pmu_cpu->act_counter[event->hw.idx] = 0;
perf_event_update_userpage(event);
}
@@ -236,20 +330,31 @@ static void arc_pmu_del(struct perf_event *event, int flags)
/* allocate hardware counter and optionally start counting */
static int arc_pmu_add(struct perf_event *event, int flags)
{
+ struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu);
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
- if (__test_and_set_bit(idx, arc_pmu->used_mask)) {
- idx = find_first_zero_bit(arc_pmu->used_mask,
+ if (__test_and_set_bit(idx, pmu_cpu->used_mask)) {
+ idx = find_first_zero_bit(pmu_cpu->used_mask,
arc_pmu->n_counters);
if (idx == arc_pmu->n_counters)
return -EAGAIN;
- __set_bit(idx, arc_pmu->used_mask);
+ __set_bit(idx, pmu_cpu->used_mask);
hwc->idx = idx;
}
write_aux_reg(ARC_REG_PCT_INDEX, idx);
+
+ pmu_cpu->act_counter[idx] = event;
+
+ if (is_sampling_event(event)) {
+ /* Mimic full counter overflow as other arches do */
+ write_aux_reg(ARC_REG_PCT_INT_CNTL, (u32)arc_pmu->max_period);
+ write_aux_reg(ARC_REG_PCT_INT_CNTH,
+ (arc_pmu->max_period >> 32));
+ }
+
write_aux_reg(ARC_REG_PCT_CONFIG, 0);
write_aux_reg(ARC_REG_PCT_COUNTL, 0);
write_aux_reg(ARC_REG_PCT_COUNTH, 0);
@@ -264,11 +369,82 @@ static int arc_pmu_add(struct perf_event *event, int flags)
return 0;
}
+#ifdef CONFIG_ISA_ARCV2
+static irqreturn_t arc_pmu_intr(int irq, void *dev)
+{
+ struct perf_sample_data data;
+ struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu);
+ struct pt_regs *regs;
+ int active_ints;
+ int idx;
+
+ arc_pmu_disable(&arc_pmu->pmu);
+
+ active_ints = read_aux_reg(ARC_REG_PCT_INT_ACT);
+
+ regs = get_irq_regs();
+
+ for (idx = 0; idx < arc_pmu->n_counters; idx++) {
+ struct perf_event *event = pmu_cpu->act_counter[idx];
+ struct hw_perf_event *hwc;
+
+ if (!(active_ints & (1 << idx)))
+ continue;
+
+ /* Reset interrupt flag by writing of 1 */
+ write_aux_reg(ARC_REG_PCT_INT_ACT, 1 << idx);
+
+ /*
+ * On reset of "interrupt active" bit corresponding
+ * "interrupt enable" bit gets automatically reset as well.
+ * Now we need to re-enable interrupt for the counter.
+ */
+ write_aux_reg(ARC_REG_PCT_INT_CTRL,
+ read_aux_reg(ARC_REG_PCT_INT_CTRL) | (1 << idx));
+
+ hwc = &event->hw;
+
+ WARN_ON_ONCE(hwc->idx != idx);
+
+ arc_perf_event_update(event, &event->hw, event->hw.idx);
+ perf_sample_data_init(&data, 0, hwc->last_period);
+ if (!arc_pmu_event_set_period(event))
+ continue;
+
+ if (perf_event_overflow(event, &data, regs))
+ arc_pmu_stop(event, 0);
+ }
+
+ arc_pmu_enable(&arc_pmu->pmu);
+
+ return IRQ_HANDLED;
+}
+#else
+
+static irqreturn_t arc_pmu_intr(int irq, void *dev)
+{
+ return IRQ_NONE;
+}
+
+#endif /* CONFIG_ISA_ARCV2 */
+
+void arc_cpu_pmu_irq_init(void)
+{
+ struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu);
+
+ arc_request_percpu_irq(arc_pmu->irq, smp_processor_id(), arc_pmu_intr,
+ "ARC perf counters", pmu_cpu);
+
+ /* Clear all pending interrupt flags */
+ write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff);
+}
+
static int arc_pmu_device_probe(struct platform_device *pdev)
{
struct arc_reg_pct_build pct_bcr;
struct arc_reg_cc_build cc_bcr;
- int i, j;
+ int i, j, has_interrupts;
+ int counter_size; /* in bits */
union cc_name {
struct {
@@ -284,7 +460,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev)
pr_err("This core does not have performance counters!\n");
return -ENODEV;
}
- BUG_ON(pct_bcr.c > ARC_PMU_MAX_HWEVENTS);
+ BUG_ON(pct_bcr.c > ARC_PERF_MAX_COUNTERS);
READ_BCR(ARC_REG_CC_BUILD, cc_bcr);
BUG_ON(!cc_bcr.v); /* Counters exist but No countable conditions ? */
@@ -293,11 +469,16 @@ static int arc_pmu_device_probe(struct platform_device *pdev)
if (!arc_pmu)
return -ENOMEM;
+ has_interrupts = is_isa_arcv2() ? pct_bcr.i : 0;
+
arc_pmu->n_counters = pct_bcr.c;
- arc_pmu->counter_size = 32 + (pct_bcr.s << 4);
+ counter_size = 32 + (pct_bcr.s << 4);
- pr_info("ARC perf\t: %d counters (%d bits), %d countable conditions\n",
- arc_pmu->n_counters, arc_pmu->counter_size, cc_bcr.c);
+ arc_pmu->max_period = (1ULL << counter_size) / 2 - 1ULL;
+
+ pr_info("ARC perf\t: %d counters (%d bits), %d conditions%s\n",
+ arc_pmu->n_counters, counter_size, cc_bcr.c,
+ has_interrupts ? ", [overflow IRQ support]":"");
cc_name.str[8] = 0;
for (i = 0; i < PERF_COUNT_ARC_HW_MAX; i++)
@@ -332,8 +513,37 @@ static int arc_pmu_device_probe(struct platform_device *pdev)
.read = arc_pmu_read,
};
- /* ARC 700 PMU does not support sampling events */
- arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+ if (has_interrupts) {
+ int irq = platform_get_irq(pdev, 0);
+ unsigned long flags;
+
+ if (irq < 0) {
+ pr_err("Cannot get IRQ number for the platform\n");
+ return -ENODEV;
+ }
+
+ arc_pmu->irq = irq;
+
+ /*
+ * arc_cpu_pmu_irq_init() needs to be called on all cores for
+ * their respective local PMU.
+ * However we use opencoded on_each_cpu() to ensure it is called
+ * on core0 first, so that arc_request_percpu_irq() sets up
+ * AUTOEN etc. Otherwise enable_percpu_irq() fails to enable
+ * perf IRQ on non master cores.
+ * see arc_request_percpu_irq()
+ */
+ preempt_disable();
+ local_irq_save(flags);
+ arc_cpu_pmu_irq_init();
+ local_irq_restore(flags);
+ smp_call_function((smp_call_func_t)arc_cpu_pmu_irq_init, 0, 1);
+ preempt_enable();
+
+ /* Clean all pending interrupt flags */
+ write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff);
+ } else
+ arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
return perf_pmu_register(&arc_pmu->pmu, pdev->name, PERF_TYPE_RAW);
}
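As a worked example of the counter-width arithmetic used in the probe path above (counter_size = 32 + (PCT_BUILD.S << 4), with max_period set to half the counter range), the following standalone program prints the resulting values for both S settings; nothing here touches hardware.

#include <stdio.h>

int main(void)
{
	/* PCT_BUILD.S selects the counter width: 0 -> 32 bit, 1 -> 48 bit */
	for (unsigned int s = 0; s <= 1; s++) {
		int counter_size = 32 + (s << 4);
		unsigned long long max_period = (1ULL << counter_size) / 2 - 1ULL;

		printf("s=%u: %d-bit counters, max_period=0x%llx\n",
		       s, counter_size, max_period);
	}
	return 0;
}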
@@ -341,6 +551,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev)
#ifdef CONFIG_OF
static const struct of_device_id arc_pmu_match[] = {
{ .compatible = "snps,arc700-pct" },
+ { .compatible = "snps,archs-pct" },
{},
};
MODULE_DEVICE_TABLE(of, arc_pmu_match);
@@ -348,7 +559,7 @@ MODULE_DEVICE_TABLE(of, arc_pmu_match);
static struct platform_driver arc_pmu_driver = {
.driver = {
- .name = "arc700-pct",
+ .name = "arc-pct",
.of_match_table = of_match_ptr(arc_pmu_match),
},
.probe = arc_pmu_device_probe,
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index 44092456776f..91d5a0f1f3f7 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -65,7 +65,7 @@ asmlinkage void ret_from_fork(void);
* ------------------
* | r25 | <==== top of Stack (thread.ksp)
* ~ ~
- * | --to-- | (CALLEE Regs of user mode)
+ * | --to-- | (CALLEE Regs of kernel mode)
* | r13 |
* ------------------
* | fp |
diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c
index a3d186211ed3..cabde9dc0696 100644
--- a/arch/arc/kernel/setup.c
+++ b/arch/arc/kernel/setup.c
@@ -47,6 +47,7 @@ static void read_arc_build_cfg_regs(void)
struct bcr_perip uncached_space;
struct bcr_generic bcr;
struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()];
+ unsigned long perip_space;
FIX_PTR(cpu);
READ_BCR(AUX_IDENTITY, cpu->core);
@@ -56,7 +57,12 @@ static void read_arc_build_cfg_regs(void)
cpu->vec_base = read_aux_reg(AUX_INTR_VEC_BASE);
READ_BCR(ARC_REG_D_UNCACH_BCR, uncached_space);
- BUG_ON((uncached_space.start << 24) != ARC_UNCACHED_ADDR_SPACE);
+ if (uncached_space.ver < 3)
+ perip_space = uncached_space.start << 24;
+ else
+ perip_space = read_aux_reg(AUX_NON_VOL) & 0xF0000000;
+
+ BUG_ON(perip_space != ARC_UNCACHED_ADDR_SPACE);
READ_BCR(ARC_REG_MUL_BCR, cpu->extn_mpy);
@@ -142,17 +148,22 @@ static void read_arc_build_cfg_regs(void)
}
static const struct cpuinfo_data arc_cpu_tbl[] = {
+#ifdef CONFIG_ISA_ARCOMPACT
{ {0x20, "ARC 600" }, 0x2F},
{ {0x30, "ARC 700" }, 0x33},
{ {0x34, "ARC 700 R4.10"}, 0x34},
{ {0x35, "ARC 700 R4.11"}, 0x35},
- { {0x50, "ARC HS38" }, 0x51},
+#else
+ { {0x50, "ARC HS38 R2.0"}, 0x51},
+ { {0x52, "ARC HS38 R2.1"}, 0x52},
+#endif
{ {0x00, NULL } }
};
-#define IS_AVAIL1(v, str) ((v) ? str : "")
-#define IS_USED(cfg) (IS_ENABLED(cfg) ? "" : "(not used) ")
-#define IS_AVAIL2(v, str, cfg) IS_AVAIL1(v, str), IS_AVAIL1(v, IS_USED(cfg))
+#define IS_AVAIL1(v, s) ((v) ? s : "")
+#define IS_USED_RUN(v) ((v) ? "" : "(not used) ")
+#define IS_USED_CFG(cfg) IS_USED_RUN(IS_ENABLED(cfg))
+#define IS_AVAIL2(v, s, cfg) IS_AVAIL1(v, s), IS_AVAIL1(v, IS_USED_CFG(cfg))
static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len)
{
@@ -226,7 +237,7 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len)
n += scnprintf(buf + n, len - n, "mpy[opt %d] ", opt);
}
n += scnprintf(buf + n, len - n, "%s",
- IS_USED(CONFIG_ARC_HAS_HW_MPY));
+ IS_USED_CFG(CONFIG_ARC_HAS_HW_MPY));
}
n += scnprintf(buf + n, len - n, "%s%s%s%s%s%s%s%s\n",
@@ -325,6 +336,10 @@ static void arc_chk_core_config(void)
pr_warn("CONFIG_ARC_FPU_SAVE_RESTORE needed for working apps\n");
else if (!cpu->extn.fpu_dp && fpu_enabled)
panic("FPU non-existent, disable CONFIG_ARC_FPU_SAVE_RESTORE\n");
+
+ if (is_isa_arcv2() && IS_ENABLED(CONFIG_SMP) && cpu->isa.atomic &&
+ !IS_ENABLED(CONFIG_ARC_STAR_9000923308))
+ panic("llock/scond livelock workaround missing\n");
}
/*
diff --git a/arch/arc/kernel/time.c b/arch/arc/kernel/time.c
index 3364d2bbc515..4294761a2b3e 100644
--- a/arch/arc/kernel/time.c
+++ b/arch/arc/kernel/time.c
@@ -203,34 +203,24 @@ static int arc_clkevent_set_next_event(unsigned long delta,
return 0;
}
-static void arc_clkevent_set_mode(enum clock_event_mode mode,
- struct clock_event_device *dev)
+static int arc_clkevent_set_periodic(struct clock_event_device *dev)
{
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- /*
- * At X Hz, 1 sec = 1000ms -> X cycles;
- * 10ms -> X / 100 cycles
- */
- arc_timer_event_setup(arc_get_core_freq() / HZ);
- break;
- case CLOCK_EVT_MODE_ONESHOT:
- break;
- default:
- break;
- }
-
- return;
+ /*
+ * At X Hz, 1 sec = 1000ms -> X cycles;
+ * 10ms -> X / 100 cycles
+ */
+ arc_timer_event_setup(arc_get_core_freq() / HZ);
+ return 0;
}
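The periodic setup above programs the timer with core_freq / HZ cycles per tick. A quick arithmetic sketch, assuming an 80 MHz core clock and HZ=100 rather than values read from the hardware:

#include <stdio.h>

int main(void)
{
	unsigned long core_freq = 80 * 1000000UL;	/* assumed core clock, not probed */
	unsigned long hz = 100;				/* assumed CONFIG_HZ */

	unsigned long cycles_per_tick = core_freq / hz;
	printf("%lu Hz clock, %lu ms tick -> program TIMER0 limit to %lu cycles\n",
	       core_freq, 1000 / hz, cycles_per_tick);
	return 0;
}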
static DEFINE_PER_CPU(struct clock_event_device, arc_clockevent_device) = {
- .name = "ARC Timer0",
- .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC,
- .mode = CLOCK_EVT_MODE_UNUSED,
- .rating = 300,
- .irq = TIMER0_IRQ, /* hardwired, no need for resources */
- .set_next_event = arc_clkevent_set_next_event,
- .set_mode = arc_clkevent_set_mode,
+ .name = "ARC Timer0",
+ .features = CLOCK_EVT_FEAT_ONESHOT |
+ CLOCK_EVT_FEAT_PERIODIC,
+ .rating = 300,
+ .irq = TIMER0_IRQ, /* hardwired, no need for resources */
+ .set_next_event = arc_clkevent_set_next_event,
+ .set_state_periodic = arc_clkevent_set_periodic,
};
static irqreturn_t timer_irq_handler(int irq, void *dev_id)
@@ -240,7 +230,7 @@ static irqreturn_t timer_irq_handler(int irq, void *dev_id)
* irq_set_chip_and_handler() asked for handle_percpu_devid_irq()
*/
struct clock_event_device *evt = this_cpu_ptr(&arc_clockevent_device);
- int irq_reenable = evt->mode == CLOCK_EVT_MODE_PERIODIC;
+ int irq_reenable = clockevent_state_periodic(evt);
/*
* Any write to CTRL reg ACks the interrupt, we rewrite the
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 807f7d61d7a7..a6f91e88ce36 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -58,7 +58,6 @@ static void show_callee_regs(struct callee_regs *cregs)
static void print_task_path_n_nm(struct task_struct *tsk, char *buf)
{
- struct path path;
char *path_nm = NULL;
struct mm_struct *mm;
struct file *exe_file;
diff --git a/arch/arc/kernel/unaligned.c b/arch/arc/kernel/unaligned.c
index 74db59b6f392..abd961f3e763 100644
--- a/arch/arc/kernel/unaligned.c
+++ b/arch/arc/kernel/unaligned.c
@@ -34,7 +34,7 @@
" .section .fixup,\"ax\"\n" \
" .align 4\n" \
"3: mov %0, 1\n" \
- " b 2b\n" \
+ " j 2b\n" \
" .previous\n" \
" .section __ex_table,\"a\"\n" \
" .align 4\n" \
@@ -82,7 +82,7 @@
" .section .fixup,\"ax\"\n" \
" .align 4\n" \
"4: mov %0, 1\n" \
- " b 3b\n" \
+ " j 3b\n" \
" .previous\n" \
" .section __ex_table,\"a\"\n" \
" .align 4\n" \
@@ -113,7 +113,7 @@
" .section .fixup,\"ax\"\n" \
" .align 4\n" \
"6: mov %0, 1\n" \
- " b 5b\n" \
+ " j 5b\n" \
" .previous\n" \
" .section __ex_table,\"a\"\n" \
" .align 4\n" \
diff --git a/arch/arc/lib/memcpy-archs.S b/arch/arc/lib/memcpy-archs.S
index 1b2b3acfed52..0cab0b8a57c5 100644
--- a/arch/arc/lib/memcpy-archs.S
+++ b/arch/arc/lib/memcpy-archs.S
@@ -206,7 +206,7 @@ unalignedOffby3:
ld.ab r6, [r1, 4]
prefetch [r1, 28] ;Prefetch the next read location
ld.ab r8, [r1,4]
- prefetch [r3, 32] ;Prefetch the next write location
+ prefetchw [r3, 32] ;Prefetch the next write location
SHIFT_1 (r7, r6, 8)
or r7, r7, r5
diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S
index 92d573c734b5..365b18364815 100644
--- a/arch/arc/lib/memset-archs.S
+++ b/arch/arc/lib/memset-archs.S
@@ -10,12 +10,6 @@
#undef PREALLOC_NOT_AVAIL
-#ifdef PREALLOC_NOT_AVAIL
-#define PREWRITE(A,B) prefetchw [(A),(B)]
-#else
-#define PREWRITE(A,B) prealloc [(A),(B)]
-#endif
-
ENTRY(memset)
prefetchw [r0] ; Prefetch the write location
mov.f 0, r2
@@ -51,9 +45,15 @@ ENTRY(memset)
;;; Convert len to Dwords, unfold x8
lsr.f lp_count, lp_count, 6
+
lpnz @.Lset64bytes
;; LOOP START
- PREWRITE(r3, 64) ;Prefetch the next write location
+#ifdef PREALLOC_NOT_AVAIL
+ prefetchw [r3, 64] ;Prefetch the next write location
+#else
+ prealloc [r3, 64]
+#endif
+#ifdef CONFIG_ARC_HAS_LL64
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
@@ -62,16 +62,45 @@ ENTRY(memset)
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
+#else
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+#endif
.Lset64bytes:
lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes
lpnz .Lset32bytes
;; LOOP START
prefetchw [r3, 32] ;Prefetch the next write location
+#ifdef CONFIG_ARC_HAS_LL64
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
std.ab r4, [r3, 8]
+#else
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+ st.ab r4, [r3, 4]
+#endif
.Lset32bytes:
and.f lp_count, r2, 0x1F ;Last remaining 31 bytes
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index b29d62ed4f7e..0d1a6e96839f 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -22,15 +22,22 @@
#include <asm/setup.h>
static int l2_line_sz;
+int ioc_exists;
+volatile int slc_enable = 1, ioc_enable = 1;
void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr,
unsigned long sz, const int cacheop);
+void (*__dma_cache_wback_inv)(unsigned long start, unsigned long sz);
+void (*__dma_cache_inv)(unsigned long start, unsigned long sz);
+void (*__dma_cache_wback)(unsigned long start, unsigned long sz);
+
char *arc_cache_mumbojumbo(int c, char *buf, int len)
{
int n = 0;
struct cpuinfo_arc_cache *p;
+#define IS_USED_RUN(v) ((v) ? "" : "(disabled) ")
#define PR_CACHE(p, cfg, str) \
if (!(p)->ver) \
n += scnprintf(buf + n, len - n, str"\t\t: N/A\n"); \
@@ -45,10 +52,18 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len)
PR_CACHE(&cpuinfo_arc700[c].icache, CONFIG_ARC_HAS_ICACHE, "I-Cache");
PR_CACHE(&cpuinfo_arc700[c].dcache, CONFIG_ARC_HAS_DCACHE, "D-Cache");
+ if (!is_isa_arcv2())
+ return buf;
+
p = &cpuinfo_arc700[c].slc;
if (p->ver)
n += scnprintf(buf + n, len - n,
- "SLC\t\t: %uK, %uB Line\n", p->sz_k, p->line_len);
+ "SLC\t\t: %uK, %uB Line%s\n",
+ p->sz_k, p->line_len, IS_USED_RUN(slc_enable));
+
+ if (ioc_exists)
+ n += scnprintf(buf + n, len - n, "IOC\t\t:%s\n",
+ IS_USED_RUN(ioc_enable));
return buf;
}
@@ -58,18 +73,9 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len)
* the cpuinfo structure for later use.
* No Validation done here, simply read/convert the BCRs
*/
-void read_decode_cache_bcr(void)
+static void read_decode_cache_bcr_arcv2(int cpu)
{
- struct cpuinfo_arc_cache *p_ic, *p_dc, *p_slc;
- unsigned int cpu = smp_processor_id();
- struct bcr_cache {
-#ifdef CONFIG_CPU_BIG_ENDIAN
- unsigned int pad:12, line_len:4, sz:4, config:4, ver:8;
-#else
- unsigned int ver:8, config:4, sz:4, line_len:4, pad:12;
-#endif
- } ibcr, dbcr;
-
+ struct cpuinfo_arc_cache *p_slc = &cpuinfo_arc700[cpu].slc;
struct bcr_generic sbcr;
struct bcr_slc_cfg {
@@ -80,6 +86,39 @@ void read_decode_cache_bcr(void)
#endif
} slc_cfg;
+ struct bcr_clust_cfg {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ unsigned int pad:7, c:1, num_entries:8, num_cores:8, ver:8;
+#else
+ unsigned int ver:8, num_cores:8, num_entries:8, c:1, pad:7;
+#endif
+ } cbcr;
+
+ READ_BCR(ARC_REG_SLC_BCR, sbcr);
+ if (sbcr.ver) {
+ READ_BCR(ARC_REG_SLC_CFG, slc_cfg);
+ p_slc->ver = sbcr.ver;
+ p_slc->sz_k = 128 << slc_cfg.sz;
+ l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64;
+ }
+
+ READ_BCR(ARC_REG_CLUSTER_BCR, cbcr);
+ if (cbcr.c && ioc_enable)
+ ioc_exists = 1;
+}
+
+void read_decode_cache_bcr(void)
+{
+ struct cpuinfo_arc_cache *p_ic, *p_dc;
+ unsigned int cpu = smp_processor_id();
+ struct bcr_cache {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ unsigned int pad:12, line_len:4, sz:4, config:4, ver:8;
+#else
+ unsigned int ver:8, config:4, sz:4, line_len:4, pad:12;
+#endif
+ } ibcr, dbcr;
+
p_ic = &cpuinfo_arc700[cpu].icache;
READ_BCR(ARC_REG_IC_BCR, ibcr);
@@ -122,17 +161,8 @@ dc_chk:
p_dc->ver = dbcr.ver;
slc_chk:
- if (!is_isa_arcv2())
- return;
-
- p_slc = &cpuinfo_arc700[cpu].slc;
- READ_BCR(ARC_REG_SLC_BCR, sbcr);
- if (sbcr.ver) {
- READ_BCR(ARC_REG_SLC_CFG, slc_cfg);
- p_slc->ver = sbcr.ver;
- p_slc->sz_k = 128 << slc_cfg.sz;
- l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64;
- }
+ if (is_isa_arcv2())
+ read_decode_cache_bcr_arcv2(cpu);
}
/*
@@ -468,10 +498,18 @@ static void __ic_line_inv_vaddr(unsigned long paddr, unsigned long vaddr,
noinline void slc_op(unsigned long paddr, unsigned long sz, const int op)
{
#ifdef CONFIG_ISA_ARCV2
+ /*
+ * SLC is shared between all cores and concurrent aux operations from
+	 * multiple cores need to be serialized using a spinlock.
+	 * A concurrent operation can be silently ignored and/or the old/new
+	 * operation can remain incomplete forever (lockup in the SLC_CTRL_BUSY
+	 * loop below).
+ */
+ static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned int ctrl;
- local_irq_save(flags);
+ spin_lock_irqsave(&lock, flags);
/*
* The Region Flush operation is specified by CTRL.RGN_OP[11..9]
@@ -504,15 +542,10 @@ noinline void slc_op(unsigned long paddr, unsigned long sz, const int op)
while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&lock, flags);
#endif
}
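A minimal user-space model of the serialization requirement described in slc_op() above: a pthread mutex stands in for the kernel spinlock and plain variables stand in for the region registers, purely to show that the multi-step region programming from two CPUs must not interleave.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t slc_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long slc_end, slc_start;

static void model_slc_op(unsigned long paddr, unsigned long sz)
{
	pthread_mutex_lock(&slc_lock);	/* the kernel uses spin_lock_irqsave() */
	slc_end = paddr + sz;		/* first half of the two-step region programming */
	slc_start = paddr;		/* second half; the pair must stay atomic w.r.t. other CPUs */
	pthread_mutex_unlock(&slc_lock);
}

static void *worker(void *arg)
{
	model_slc_op((unsigned long)arg, 4096);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, worker, (void *)0x80000000UL);
	pthread_create(&b, NULL, worker, (void *)0x90000000UL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	printf("last SLC op covered [%#lx..%#lx)\n", slc_start, slc_end);
	return 0;
}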
-static inline int need_slc_flush(void)
-{
- return is_isa_arcv2() && l2_line_sz;
-}
-
/***********************************************************
* Exported APIs
*/
@@ -561,30 +594,74 @@ void flush_dcache_page(struct page *page)
}
EXPORT_SYMBOL(flush_dcache_page);
-void dma_cache_wback_inv(unsigned long start, unsigned long sz)
+/*
+ * DMA ops for systems with L1 cache only
+ * Make memory coherent with L1 cache by flushing/invalidating L1 lines
+ */
+static void __dma_cache_wback_inv_l1(unsigned long start, unsigned long sz)
{
__dc_line_op_k(start, sz, OP_FLUSH_N_INV);
+}
- if (need_slc_flush())
- slc_op(start, sz, OP_FLUSH_N_INV);
+static void __dma_cache_inv_l1(unsigned long start, unsigned long sz)
+{
+ __dc_line_op_k(start, sz, OP_INV);
}
-EXPORT_SYMBOL(dma_cache_wback_inv);
-void dma_cache_inv(unsigned long start, unsigned long sz)
+static void __dma_cache_wback_l1(unsigned long start, unsigned long sz)
+{
+ __dc_line_op_k(start, sz, OP_FLUSH);
+}
+
+/*
+ * DMA ops for systems with both L1 and L2 caches, but without IOC
+ * Both L1 and L2 lines need to be explicitly flushed/invalidated
+ */
+static void __dma_cache_wback_inv_slc(unsigned long start, unsigned long sz)
+{
+ __dc_line_op_k(start, sz, OP_FLUSH_N_INV);
+ slc_op(start, sz, OP_FLUSH_N_INV);
+}
+
+static void __dma_cache_inv_slc(unsigned long start, unsigned long sz)
{
__dc_line_op_k(start, sz, OP_INV);
+ slc_op(start, sz, OP_INV);
+}
+
+static void __dma_cache_wback_slc(unsigned long start, unsigned long sz)
+{
+ __dc_line_op_k(start, sz, OP_FLUSH);
+ slc_op(start, sz, OP_FLUSH);
+}
- if (need_slc_flush())
- slc_op(start, sz, OP_INV);
+/*
+ * DMA ops for systems with IOC
+ * IOC hardware snoops all DMA traffic, keeping the caches consistent with
+ * memory and eliding the need for any explicit cache maintenance of DMA buffers
+ */
+static void __dma_cache_wback_inv_ioc(unsigned long start, unsigned long sz) {}
+static void __dma_cache_inv_ioc(unsigned long start, unsigned long sz) {}
+static void __dma_cache_wback_ioc(unsigned long start, unsigned long sz) {}
+
+/*
+ * Exported DMA API
+ */
+void dma_cache_wback_inv(unsigned long start, unsigned long sz)
+{
+ __dma_cache_wback_inv(start, sz);
+}
+EXPORT_SYMBOL(dma_cache_wback_inv);
+
+void dma_cache_inv(unsigned long start, unsigned long sz)
+{
+ __dma_cache_inv(start, sz);
}
EXPORT_SYMBOL(dma_cache_inv);
void dma_cache_wback(unsigned long start, unsigned long sz)
{
- __dc_line_op_k(start, sz, OP_FLUSH);
-
- if (need_slc_flush())
- slc_op(start, sz, OP_FLUSH);
+ __dma_cache_wback(start, sz);
}
EXPORT_SYMBOL(dma_cache_wback);
@@ -840,4 +917,41 @@ void arc_cache_init(void)
panic("Disable CONFIG_ARC_CACHE_VIPT_ALIASING\n");
}
}
+
+ if (is_isa_arcv2() && l2_line_sz && !slc_enable) {
+
+ /* IM set : flush before invalidate */
+ write_aux_reg(ARC_REG_SLC_CTRL,
+ read_aux_reg(ARC_REG_SLC_CTRL) | SLC_CTRL_IM);
+
+ write_aux_reg(ARC_REG_SLC_INVALIDATE, 1);
+
+ /* Important to wait for flush to complete */
+ while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY);
+ write_aux_reg(ARC_REG_SLC_CTRL,
+ read_aux_reg(ARC_REG_SLC_CTRL) | SLC_CTRL_DISABLE);
+ }
+
+ if (is_isa_arcv2() && ioc_exists) {
+ /* IO coherency base - 0x8z */
+ write_aux_reg(ARC_REG_IO_COH_AP0_BASE, 0x80000);
+		/* IO coherency aperture size - 512MB: 0x8z-0xAz */
+ write_aux_reg(ARC_REG_IO_COH_AP0_SIZE, 0x11);
+ /* Enable partial writes */
+ write_aux_reg(ARC_REG_IO_COH_PARTIAL, 1);
+ /* Enable IO coherency */
+ write_aux_reg(ARC_REG_IO_COH_ENABLE, 1);
+
+ __dma_cache_wback_inv = __dma_cache_wback_inv_ioc;
+ __dma_cache_inv = __dma_cache_inv_ioc;
+ __dma_cache_wback = __dma_cache_wback_ioc;
+ } else if (is_isa_arcv2() && l2_line_sz && slc_enable) {
+ __dma_cache_wback_inv = __dma_cache_wback_inv_slc;
+ __dma_cache_inv = __dma_cache_inv_slc;
+ __dma_cache_wback = __dma_cache_wback_slc;
+ } else {
+ __dma_cache_wback_inv = __dma_cache_wback_inv_l1;
+ __dma_cache_inv = __dma_cache_inv_l1;
+ __dma_cache_wback = __dma_cache_wback_l1;
+ }
}
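The __dma_cache_* function pointers installed at the end of arc_cache_init() above keep the per-call DMA paths free of runtime checks. Below is a standalone sketch of the same init-time dispatch pattern, with hard-coded probe results in place of the BCR reads.

#include <stdio.h>

static void wback_inv_l1(unsigned long start, unsigned long sz)
{
	printf("L1 flush+inv of %#lx, %lu bytes\n", start, sz);
}

static void wback_inv_slc(unsigned long start, unsigned long sz)
{
	printf("L1 + SLC flush+inv of %#lx, %lu bytes\n", start, sz);
}

static void wback_inv_ioc(unsigned long start, unsigned long sz)
{
	(void)start;
	(void)sz;	/* IOC snoops DMA, nothing to do */
}

static void (*dma_wback_inv)(unsigned long start, unsigned long sz);

static void cache_init(int ioc_exists, int have_slc)
{
	/* Pick the callback set once, based on what the cache probe found */
	if (ioc_exists)
		dma_wback_inv = wback_inv_ioc;
	else if (have_slc)
		dma_wback_inv = wback_inv_slc;
	else
		dma_wback_inv = wback_inv_l1;
}

int main(void)
{
	cache_init(0 /* ioc_exists */, 1 /* have_slc */);
	dma_wback_inv(0x80000000UL, 4096UL);	/* hot path: one indirect call, no branches */
	return 0;
}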
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index 74a637a1cfc4..29a46bb198cc 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -19,6 +19,7 @@
#include <linux/dma-mapping.h>
#include <linux/dma-debug.h>
#include <linux/export.h>
+#include <asm/cache.h>
#include <asm/cacheflush.h>
/*
@@ -53,6 +54,20 @@ void *dma_alloc_coherent(struct device *dev, size_t size,
{
void *paddr, *kvaddr;
+ /*
+	 * IOC relies on all data (even coherent DMA data) being in cache,
+	 * thus allocate normal cached memory.
+	 *
+	 * The gains with IOC are two-pronged:
+	 *   -For streaming data, it elides the need for cache maintenance,
+	 *    saving cycles in flush code and bus bandwidth, since without IOC
+	 *    all the lines of a buffer would need to be flushed out to memory
+	 *   -For coherent data, reads/writes to buffers terminate early in the
+	 *    cache (vs. always going to memory) and are thus faster
+ */
+ if (is_isa_arcv2() && ioc_exists)
+ return dma_alloc_noncoherent(dev, size, dma_handle, gfp);
+
/* This is linear addr (0x8000_0000 based) */
paddr = alloc_pages_exact(size, gfp);
if (!paddr)
@@ -60,8 +75,8 @@ void *dma_alloc_coherent(struct device *dev, size_t size,
/* This is kernel Virtual address (0x7000_0000 based) */
kvaddr = ioremap_nocache((unsigned long)paddr, size);
- if (kvaddr != NULL)
- memset(kvaddr, 0, size);
+ if (kvaddr == NULL)
+ return NULL;
/* This is bus address, platform dependent */
*dma_handle = (dma_addr_t)paddr;
@@ -85,6 +100,9 @@ EXPORT_SYMBOL(dma_alloc_coherent);
void dma_free_coherent(struct device *dev, size_t size, void *kvaddr,
dma_addr_t dma_handle)
{
+ if (is_isa_arcv2() && ioc_exists)
+ return dma_free_noncoherent(dev, size, kvaddr, dma_handle);
+
iounmap((void __force __iomem *)kvaddr);
free_pages_exact((void *)dma_handle, size);
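A small standalone sketch of the allocation decision made in dma_alloc_coherent() above: with an IOC present, plain cached memory is handed out; otherwise the buffer would get an uncached mapping. Both branches simply use calloc() here, and the comment marks where the kernel would remap uncached.

#include <stdio.h>
#include <stdlib.h>

static int ioc_exists = 1;	/* pretend the cluster BCR advertised an IOC */

static void *alloc_cached(size_t size)   { return calloc(1, size); }
static void *alloc_uncached(size_t size) { return calloc(1, size); /* kernel: ioremap_nocache() */ }

static void *model_dma_alloc(size_t size)
{
	/* IOC snoops DMA traffic, so normal cached memory stays coherent */
	if (ioc_exists)
		return alloc_cached(size);

	/* No IOC: hand out an uncached view of the buffer instead */
	return alloc_uncached(size);
}

int main(void)
{
	void *buf = model_dma_alloc(4096);

	printf("allocated %s DMA buffer at %p\n",
	       ioc_exists ? "cached" : "uncached", buf);
	free(buf);
	return 0;
}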
diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c
index 99f7da513a48..ad9825d4026a 100644
--- a/arch/arc/plat-axs10x/axs10x.c
+++ b/arch/arc/plat-axs10x/axs10x.c
@@ -46,7 +46,7 @@ static void __init axs10x_enable_gpio_intc_wire(void)
* ------------------- -------------------
* | snps,dw-apb-gpio | | snps,dw-apb-gpio |
* ------------------- -------------------
- * | |
+ * | #12 |
* | [ Debug UART on cpu card ]
* |
* ------------------------
@@ -389,6 +389,21 @@ axs103_set_freq(unsigned int id, unsigned int fd, unsigned int od)
static void __init axs103_early_init(void)
{
+ /*
+	 * The AXS103 SMP and QUAD configurations share a device tree, which
+	 * defaults to 90 MHz. However, recent failures of the Quad config
+	 * revealed P&R timing violations, so clamp it down to a safe 50 MHz.
+	 * Instead of duplicating the defconfig/DT for SMP/QUAD, add a small hack.
+	 *
+	 * This is still quite a hack for now. Fix it properly by getting the
+	 * number of cores as the return value of the platform's early SMP callback.
+ */
+#ifdef CONFIG_ARC_MCIP
+ unsigned int num_cores = (read_aux_reg(ARC_REG_MCIP_BCR) >> 16) & 0x3F;
+ if (num_cores > 2)
+ arc_set_core_freq(50 * 1000000);
+#endif
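The #ifdef block above derives the core count from bits [21:16] of the MCIP build register before deciding whether to clamp the clock. A standalone sketch of the same decode, using a made-up BCR value instead of an aux-register read:

#include <stdio.h>

int main(void)
{
	unsigned int mcip_bcr = 0x00040002;	/* made-up value: 4 cores in bits [21:16] */

	unsigned int num_cores = (mcip_bcr >> 16) & 0x3F;
	unsigned int freq = (num_cores > 2) ? 50 * 1000000 : 90 * 1000000;

	printf("%u cores -> run the core clock at %u MHz\n",
	       num_cores, freq / 1000000);
	return 0;
}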
+
switch (arc_get_core_freq()/1000000) {
case 33:
axs103_set_freq(1, 1, 1);