summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/Kconfig.cpu1
-rw-r--r--arch/x86/Makefile6
-rw-r--r--arch/x86/ia32/sys_ia32.c19
-rw-r--r--arch/x86/include/asm/desc.h2
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/lguest_hcall.h2
-rw-r--r--arch/x86/include/asm/mce.h1
-rw-r--r--arch/x86/include/asm/pat.h4
-rw-r--r--arch/x86/include/asm/processor.h6
-rw-r--r--arch/x86/include/asm/required-features.h2
-rw-r--r--arch/x86/include/asm/tlbflush.h2
-rw-r--r--arch/x86/include/asm/topology.h2
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h5
-rw-r--r--arch/x86/include/asm/xen/page.h3
-rw-r--r--arch/x86/kernel/amd_iommu_init.c16
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/nmi.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c31
-rw-r--r--arch/x86/kernel/bios_uv.c3
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c38
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c33
-rw-r--r--arch/x86/kernel/cpu/proc.c2
-rw-r--r--arch/x86/kernel/entry_64.S3
-rw-r--r--arch/x86/kernel/hpet.c24
-rw-r--r--arch/x86/kernel/i8253.c2
-rw-r--r--arch/x86/kernel/kvmclock.c7
-rw-r--r--arch/x86/kernel/microcode_core.c35
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/tlb_uv.c189
-rw-r--r--arch/x86/kernel/tsc.c2
-rw-r--r--arch/x86/kernel/uv_sysfs.c4
-rw-r--r--arch/x86/kernel/uv_time.c10
-rw-r--r--arch/x86/kernel/vmiclock_32.c2
-rw-r--r--arch/x86/kernel/xsave.c4
-rw-r--r--arch/x86/kvm/mmu.c2
-rw-r--r--arch/x86/kvm/x86.c8
-rw-r--r--arch/x86/lguest/boot.c18
-rw-r--r--arch/x86/mm/ioremap.c10
-rw-r--r--arch/x86/mm/kmmio.c2
-rw-r--r--arch/x86/mm/numa_32.c2
-rw-r--r--arch/x86/mm/numa_64.c3
-rw-r--r--arch/x86/mm/pageattr.c127
-rw-r--r--arch/x86/mm/pat.c189
-rw-r--r--arch/x86/mm/srat_64.c6
-rw-r--r--arch/x86/pci/amd_bus.c6
-rw-r--r--arch/x86/pci/common.c5
-rw-r--r--arch/x86/pci/i386.c4
-rw-r--r--arch/x86/pci/mmconfig-shared.c6
-rw-r--r--arch/x86/vdso/vclock_gettime.c12
-rw-r--r--arch/x86/xen/enlighten.c89
-rw-r--r--arch/x86/xen/mmu.c116
-rw-r--r--arch/x86/xen/mmu.h3
-rw-r--r--arch/x86/xen/smp.c4
-rw-r--r--arch/x86/xen/time.c7
-rw-r--r--arch/x86/xen/xen-ops.h2
58 files changed, 632 insertions, 467 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bc25b9f5e4cd..df9e885eee14 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -277,6 +277,7 @@ config SPARSE_IRQ
config NUMA_MIGRATE_IRQ_DESC
bool "Move irq desc when changing irq smp_affinity"
depends on SPARSE_IRQ && NUMA
+ depends on BROKEN
default n
---help---
This enables moving irq_desc to cpu/node that irq will use handled.
@@ -353,6 +354,7 @@ config X86_UV
bool "SGI Ultraviolet"
depends on X86_64
depends on X86_EXTENDED_PLATFORM
+ depends on NUMA
select X86_X2APIC
---help---
This option is needed in order to support SGI Ultraviolet systems.
@@ -663,6 +665,7 @@ config MAXSMP
config NR_CPUS
int "Maximum number of CPUs" if SMP && !MAXSMP
+ range 2 8 if SMP && X86_32 && !X86_BIGSMP
range 2 512 if SMP && !MAXSMP
default "1" if !SMP
default "4096" if MAXSMP
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 924e156a85ab..8130334329c0 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -506,6 +506,7 @@ config X86_PTRACE_BTS
bool "Branch Trace Store"
default y
depends on X86_DEBUGCTLMSR
+ depends on BROKEN
---help---
This adds a ptrace interface to the hardware's branch trace store.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index f05d8c91d9e5..8c86b72afdc2 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -153,7 +153,7 @@ endif
boot := arch/x86/boot
-BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage install
+BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage
PHONY += bzImage $(BOOT_TARGETS)
@@ -171,6 +171,10 @@ bzImage: vmlinux
$(BOOT_TARGETS): vmlinux
$(Q)$(MAKE) $(build)=$(boot) $@
+PHONY += install
+install:
+ $(Q)$(MAKE) $(build)=$(boot) $@
+
PHONY += vdso_install
vdso_install:
$(Q)$(MAKE) $(build)=arch/x86/vdso $@
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index efac92fd1efb..085a8c35f149 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -129,21 +129,12 @@ asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
struct stat64 __user *statbuf, int flag)
{
struct kstat stat;
- int error = -EINVAL;
+ int error;
- if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
- goto out;
-
- if (flag & AT_SYMLINK_NOFOLLOW)
- error = vfs_lstat_fd(dfd, filename, &stat);
- else
- error = vfs_stat_fd(dfd, filename, &stat);
-
- if (!error)
- error = cp_stat64(statbuf, &stat);
-
-out:
- return error;
+ error = vfs_fstatat(dfd, filename, &stat, flag);
+ if (error)
+ return error;
+ return cp_stat64(statbuf, &stat);
}
/*
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 5623c50d67b2..c45f415ce315 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -37,7 +37,7 @@ extern gate_desc idt_table[];
struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
-DECLARE_PER_CPU(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 039db6aa8e02..37555e52f980 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -26,7 +26,7 @@ typedef struct {
#endif
} ____cacheline_aligned irq_cpustat_t;
-DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
+DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
#define MAX_HARDIRQS_PER_CPU NR_VECTORS
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 0f4ee7148afe..faae1996487b 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -5,7 +5,6 @@
#define LHCALL_FLUSH_ASYNC 0
#define LHCALL_LGUEST_INIT 1
#define LHCALL_SHUTDOWN 2
-#define LHCALL_LOAD_GDT 3
#define LHCALL_NEW_PGTABLE 4
#define LHCALL_FLUSH_TLB 5
#define LHCALL_LOAD_IDT_ENTRY 6
@@ -17,6 +16,7 @@
#define LHCALL_SET_PMD 15
#define LHCALL_LOAD_TLS 16
#define LHCALL_NOTIFY 17
+#define LHCALL_LOAD_GDT_ENTRY 18
#define LGUEST_TRAP_ENTRY 0x1F
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 563933e06a35..4f8c199584e7 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -137,6 +137,7 @@ DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
enum mcp_flags {
MCP_TIMESTAMP = (1 << 0), /* log time stamp */
MCP_UC = (1 << 1), /* log uncorrected errors */
+ MCP_DONTLOG = (1 << 2), /* only clear, don't log */
};
extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 2cd07b9422f4..7af14e512f97 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -18,9 +18,5 @@ extern int free_memtype(u64 start, u64 end);
extern int kernel_map_sync_memtype(u64 base, unsigned long size,
unsigned long flag);
-extern void map_devmem(unsigned long pfn, unsigned long size,
- struct pgprot vma_prot);
-extern void unmap_devmem(unsigned long pfn, unsigned long size,
- struct pgprot vma_prot);
#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fcf4d92e7e04..c2cceae709c8 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -138,7 +138,7 @@ extern struct tss_struct doublefault_tss;
extern __u32 cleared_cpu_caps[NCAPINTS];
#ifdef CONFIG_SMP
-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu) per_cpu(cpu_info, cpu)
#define current_cpu_data __get_cpu_var(cpu_info)
#else
@@ -270,7 +270,7 @@ struct tss_struct {
} ____cacheline_aligned;
-DECLARE_PER_CPU(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
/*
* Save the original ist values for checking stack pointers during debugging
@@ -393,7 +393,7 @@ union irq_stack_union {
};
};
-DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
+DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union);
DECLARE_INIT_PER_CPU(irq_stack_union);
DECLARE_PER_CPU(char *, irq_stack_ptr);
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index d5cd6c586881..a4737dddfd58 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -50,7 +50,7 @@
#ifdef CONFIG_X86_64
#define NEED_PSE 0
#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
-#define NEED_PGE (1<<(X86_FEATURE_PGE & 31))
+#define NEED_PGE 0
#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e2927c5f45b1..a5ecc9c33e92 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -152,7 +152,7 @@ struct tlb_state {
struct mm_struct *active_mm;
int state;
};
-DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
static inline void reset_lazy_tlbstate(void)
{
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 892b119dba6f..f44b49abca49 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -200,7 +200,7 @@ static inline void arch_fix_phys_package_id(int num, u32 slot)
}
struct pci_bus;
-void set_pci_bus_resources_arch_default(struct pci_bus *b);
+void x86_pci_root_bus_res_quirks(struct pci_bus *b);
#ifdef CONFIG_SMP
#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index db68ac8a5ac2..2cae46c7c8a2 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -17,6 +17,11 @@
/* ========================================================================= */
/* UVH_BAU_DATA_CONFIG */
/* ========================================================================= */
+#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
+#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
+/* 1011 timebase 7 (168millisec) * 3 ticks -> 500ms */
#define UVH_BAU_DATA_CONFIG 0x61680UL
#define UVH_BAU_DATA_CONFIG_32 0x0438
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 1a918dde46b5..018a0a400799 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -124,7 +124,8 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
/* VIRT <-> MACHINE conversion */
#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
-#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v))))
+#define virt_to_pfn(v) (PFN_DOWN(__pa(v)))
+#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
static inline unsigned long pte_mfn(pte_t pte)
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 42c33cebf00f..8c0be0902dac 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -49,10 +49,10 @@
#define IVHD_DEV_EXT_SELECT 0x46
#define IVHD_DEV_EXT_SELECT_RANGE 0x47
-#define IVHD_FLAG_HT_TUN_EN 0x00
-#define IVHD_FLAG_PASSPW_EN 0x01
-#define IVHD_FLAG_RESPASSPW_EN 0x02
-#define IVHD_FLAG_ISOC_EN 0x03
+#define IVHD_FLAG_HT_TUN_EN_MASK 0x01
+#define IVHD_FLAG_PASSPW_EN_MASK 0x02
+#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04
+#define IVHD_FLAG_ISOC_EN_MASK 0x08
#define IVMD_FLAG_EXCL_RANGE 0x08
#define IVMD_FLAG_UNITY_MAP 0x01
@@ -569,19 +569,19 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
* First set the recommended feature enable bits from ACPI
* into the IOMMU control registers
*/
- h->flags & IVHD_FLAG_HT_TUN_EN ?
+ h->flags & IVHD_FLAG_HT_TUN_EN_MASK ?
iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
- h->flags & IVHD_FLAG_PASSPW_EN ?
+ h->flags & IVHD_FLAG_PASSPW_EN_MASK ?
iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
- h->flags & IVHD_FLAG_RESPASSPW_EN ?
+ h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
- h->flags & IVHD_FLAG_ISOC_EN ?
+ h->flags & IVHD_FLAG_ISOC_EN_MASK ?
iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
iommu_feature_disable(iommu, CONTROL_ISOC_EN);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a2789e42e162..30da617d18e4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3670,12 +3670,14 @@ int arch_setup_hpet_msi(unsigned int irq)
{
int ret;
struct msi_msg msg;
+ struct irq_desc *desc = irq_to_desc(irq);
ret = msi_compose_msg(NULL, irq, &msg);
if (ret < 0)
return ret;
hpet_msi_write(irq, &msg);
+ desc->status |= IRQ_MOVE_PCNTXT;
set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
"edge");
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index d6bd62407152..ce4fbfa315a1 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -138,7 +138,7 @@ int __init check_nmi_watchdog(void)
if (!prev_nmi_count)
goto error;
- alloc_cpumask_var(&backtrace_mask, GFP_KERNEL);
+ alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
printk(KERN_INFO "Testing NMI watchdog ... ");
#ifdef CONFIG_SMP
@@ -414,7 +414,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
touched = 1;
}
- if (cpumask_test_cpu(cpu, backtrace_mask)) {
+ /* We can be called before check_nmi_watchdog, hence NULL check. */
+ if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) {
static DEFINE_SPINLOCK(lock); /* Serialise the printks */
spin_lock(&lock);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1248318436e8..2bda69352976 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -19,6 +19,7 @@
#include <linux/timer.h>
#include <linux/cpu.h>
#include <linux/init.h>
+#include <linux/io.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
@@ -34,6 +35,17 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
static enum uv_system_type uv_system_type;
+static int early_get_nodeid(void)
+{
+ union uvh_node_id_u node_id;
+ unsigned long *mmr;
+
+ mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
+ node_id.v = *mmr;
+ early_iounmap(mmr, sizeof(*mmr));
+ return node_id.s.node_id;
+}
+
static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
if (!strcmp(oem_id, "SGI")) {
@@ -42,6 +54,8 @@ static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
else if (!strcmp(oem_table_id, "UVX"))
uv_system_type = UV_X2APIC;
else if (!strcmp(oem_table_id, "UVH")) {
+ __get_cpu_var(x2apic_extra_bits) =
+ early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1);
uv_system_type = UV_NON_UNIQUE_APIC;
return 1;
}
@@ -549,7 +563,8 @@ void __init uv_system_init(void)
unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
int max_pnode = 0;
- unsigned long mmr_base, present;
+ unsigned long mmr_base, present, paddr;
+ unsigned short pnode_mask;
map_low_mmrs();
@@ -592,6 +607,7 @@ void __init uv_system_init(void)
}
}
+ pnode_mask = (1 << n_val) - 1;
node_id.v = uv_read_local_mmr(UVH_NODE_ID);
gnode_upper = (((unsigned long)node_id.s.node_id) &
~((1 << n_val) - 1)) << m_val;
@@ -615,7 +631,7 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->numa_blade_id = blade;
uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
uv_cpu_hub_info(cpu)->pnode = pnode;
- uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1;
+ uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
@@ -631,6 +647,17 @@ void __init uv_system_init(void)
lcpu, blade);
}
+ /* Add blade/pnode info for nodes without cpus */
+ for_each_online_node(nid) {
+ if (uv_node_to_blade[nid] >= 0)
+ continue;
+ paddr = node_start_pfn(nid) << PAGE_SHIFT;
+ paddr = uv_soc_phys_ram_to_gpa(paddr);
+ pnode = (paddr >> m_val) & pnode_mask;
+ blade = boot_pnode_to_blade(pnode);
+ uv_node_to_blade[nid] = blade;
+ }
+
map_gru_high(max_pnode);
map_mmr_high(max_pnode);
map_config_high(max_pnode);
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index f63882728d91..63a88e1f987d 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -182,7 +182,8 @@ void uv_bios_init(void)
memcpy(&uv_systab, tab, sizeof(struct uv_systab));
iounmap(tab);
- printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision);
+ printk(KERN_INFO "EFI UV System Table Revision %d\n",
+ uv_systab.revision);
}
#else /* !CONFIG_EFI */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c4f667896c28..c1caefc82e62 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1203,6 +1203,8 @@ void __cpuinit cpu_init(void)
load_TR_desc();
load_LDT(&init_mm.context);
+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
#ifdef CONFIG_DOUBLEFAULT
/* Set up doublefault TSS pointer in the GDT */
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 3e3cd3db7a0c..208ecf6643df 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -65,14 +65,18 @@ enum {
struct acpi_cpufreq_data {
struct acpi_processor_performance *acpi_data;
struct cpufreq_frequency_table *freq_table;
- unsigned int max_freq;
unsigned int resume;
unsigned int cpu_feature;
- u64 saved_aperf, saved_mperf;
};
static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
+struct acpi_msr_data {
+ u64 saved_aperf, saved_mperf;
+};
+
+static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
+
DEFINE_TRACE(power_mark);
/* acpi_perf_data is a pointer to percpu data. */
@@ -204,7 +208,13 @@ static void drv_read(struct drv_cmd *cmd)
static void drv_write(struct drv_cmd *cmd)
{
+ int this_cpu;
+
+ this_cpu = get_cpu();
+ if (cpumask_test_cpu(this_cpu, cmd->mask))
+ do_drv_write(cmd);
smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
+ put_cpu();
}
static u32 get_cur_val(const struct cpumask *mask)
@@ -277,15 +287,15 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
unsigned int perf_percent;
unsigned int retval;
- if (smp_call_function_single(cpu, read_measured_perf_ctrs, &cur, 1))
+ if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1))
return 0;
cur.aperf.whole = readin.aperf.whole -
- per_cpu(drv_data, cpu)->saved_aperf;
+ per_cpu(msr_data, cpu).saved_aperf;
cur.mperf.whole = readin.mperf.whole -
- per_cpu(drv_data, cpu)->saved_mperf;
- per_cpu(drv_data, cpu)->saved_aperf = readin.aperf.whole;
- per_cpu(drv_data, cpu)->saved_mperf = readin.mperf.whole;
+ per_cpu(msr_data, cpu).saved_mperf;
+ per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
+ per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
#ifdef __i386__
/*
@@ -329,7 +339,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
#endif
- retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
+ retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
return retval;
}
@@ -682,16 +692,11 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
/* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
policy->cpuinfo.transition_latency > 20 * 1000) {
- static int print_once;
policy->cpuinfo.transition_latency = 20 * 1000;
- if (!print_once) {
- print_once = 1;
- printk(KERN_INFO "Capping off P-state tranision latency"
- " at 20 uS\n");
- }
+ printk_once(KERN_INFO "Capping off P-state tranision"
+ " latency at 20 uS\n");
}
- data->max_freq = perf->states[0].core_frequency * 1000;
/* table init */
for (i = 0; i < perf->state_count; i++) {
if (i > 0 && perf->states[i].core_frequency >=
@@ -710,6 +715,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
if (result)
goto err_freqfree;
+ if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
+ printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
+
switch (perf->control_register.space_id) {
case ACPI_ADR_SPACE_SYSTEM_IO:
/* Current speed is unknown and not detectable by IO port */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 863f89568b1a..6fb0b359d2a5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -239,9 +239,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
-
- mce_log(&m);
- add_taint(TAINT_MACHINE_CHECK);
+ if (!(flags & MCP_DONTLOG)) {
+ mce_log(&m);
+ add_taint(TAINT_MACHINE_CHECK);
+ }
/*
* Clear state for this bank.
@@ -452,13 +453,14 @@ void mce_log_therm_throt_event(__u64 status)
*/
static int check_interval = 5 * 60; /* 5 minutes */
-static int next_interval; /* in jiffies */
+static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);
static void mcheck_timer(unsigned long data)
{
struct timer_list *t = &per_cpu(mce_timer, data);
+ int *n;
WARN_ON(smp_processor_id() != data);
@@ -470,14 +472,14 @@ static void mcheck_timer(unsigned long data)
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
+ n = &__get_cpu_var(next_interval);
if (mce_notify_user()) {
- next_interval = max(next_interval/2, HZ/100);
+ *n = max(*n/2, HZ/100);
} else {
- next_interval = min(next_interval * 2,
- (int)round_jiffies_relative(check_interval*HZ));
+ *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
}
- t->expires = jiffies + next_interval;
+ t->expires = jiffies + *n;
add_timer(t);
}
@@ -584,7 +586,7 @@ static void mce_init(void *dummy)
* Log the machine checks left over from the previous reset.
*/
bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC, &all_banks);
+ machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
set_in_cr4(X86_CR4_MCE);
@@ -632,14 +634,13 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
static void mce_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
+ int *n = &__get_cpu_var(next_interval);
- /* data race harmless because everyone sets to the same value */
- if (!next_interval)
- next_interval = check_interval * HZ;
- if (!next_interval)
+ *n = check_interval * HZ;
+ if (!*n)
return;
setup_timer(t, mcheck_timer, smp_processor_id());
- t->expires = round_jiffies(jiffies + next_interval);
+ t->expires = round_jiffies(jiffies + *n);
add_timer(t);
}
@@ -907,7 +908,6 @@ static void mce_cpu_restart(void *data)
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
- next_interval = check_interval * HZ;
on_each_cpu(mce_cpu_restart, NULL, 1);
}
@@ -1110,7 +1110,8 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- t->expires = round_jiffies(jiffies + next_interval);
+ t->expires = round_jiffies(jiffies +
+ __get_cpu_var(next_interval));
add_timer_on(t, cpu);
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
break;
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index f93047fed791..d5e30397246b 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -14,7 +14,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
if (c->x86_max_cores * smp_num_siblings > 1) {
seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
seq_printf(m, "siblings\t: %d\n",
- cpumask_weight(cpu_sibling_mask(cpu)));
+ cpumask_weight(cpu_core_mask(cpu)));
seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
seq_printf(m, "apicid\t\t: %d\n", c->apicid);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1ac99865591c..987f91f0f755 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1397,7 +1397,10 @@ ENTRY(paranoid_exit)
paranoid_swapgs:
TRACE_IRQS_IRETQ 0
SWAPGS_UNSAFE_STACK
+ RESTORE_ALL 8
+ jmp irq_return
paranoid_restore:
+ TRACE_IRQS_IRETQ 0
RESTORE_ALL 8
jmp irq_return
paranoid_userspace:
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 648b3a2a3a44..81408b93f887 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -236,6 +236,10 @@ static void hpet_stop_counter(void)
unsigned long cfg = hpet_readl(HPET_CFG);
cfg &= ~HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
+}
+
+static void hpet_reset_counter(void)
+{
hpet_writel(0, HPET_COUNTER);
hpet_writel(0, HPET_COUNTER + 4);
}
@@ -250,6 +254,7 @@ static void hpet_start_counter(void)
static void hpet_restart_counter(void)
{
hpet_stop_counter();
+ hpet_reset_counter();
hpet_start_counter();
}
@@ -309,7 +314,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
static void hpet_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt, int timer)
{
- unsigned long cfg;
+ unsigned long cfg, cmp, now;
uint64_t delta;
switch (mode) {
@@ -317,12 +322,23 @@ static void hpet_set_mode(enum clock_event_mode mode,
hpet_stop_counter();
delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
delta >>= evt->shift;
+ now = hpet_readl(HPET_COUNTER);
+ cmp = now + (unsigned long) delta;
cfg = hpet_readl(HPET_Tn_CFG(timer));
/* Make sure we use edge triggered interrupts */
cfg &= ~HPET_TN_LEVEL;
cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
HPET_TN_SETVAL | HPET_TN_32BIT;
hpet_writel(cfg, HPET_Tn_CFG(timer));
+ hpet_writel(cmp, HPET_Tn_CMP(timer));
+ udelay(1);
+ /*
+ * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+ * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+ * bit is automatically cleared after the first write.
+ * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+ * Publication # 24674)
+ */
hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
hpet_start_counter();
hpet_print_config();
@@ -722,7 +738,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
/*
* Clock source related code
*/
-static cycle_t read_hpet(void)
+static cycle_t read_hpet(struct clocksource *cs)
{
return (cycle_t)hpet_readl(HPET_COUNTER);
}
@@ -756,7 +772,7 @@ static int hpet_clocksource_register(void)
hpet_restart_counter();
/* Verify whether hpet counter works */
- t1 = read_hpet();
+ t1 = hpet_readl(HPET_COUNTER);
rdtscll(start);
/*
@@ -770,7 +786,7 @@ static int hpet_clocksource_register(void)
rdtscll(now);
} while ((now - start) < 200000UL);
- if (t1 == read_hpet()) {
+ if (t1 == hpet_readl(HPET_COUNTER)) {
printk(KERN_WARNING
"HPET counter not counting. HPET disabled\n");
return -ENODEV;
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 3475440baa54..c2e0bb0890d4 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -129,7 +129,7 @@ void __init setup_pit_timer(void)
* to just read by itself. So use jiffies to emulate a free
* running counter:
*/
-static cycle_t pit_read(void)
+static cycle_t pit_read(struct clocksource *cs)
{
static int old_count;
static u32 old_jifs;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 137f2e8132df..223af43f1526 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -77,6 +77,11 @@ static cycle_t kvm_clock_read(void)
return ret;
}
+static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
+{
+ return kvm_clock_read();
+}
+
/*
* If we don't do that, there is the possibility that the guest
* will calibrate under heavy load - thus, getting a lower lpj -
@@ -107,7 +112,7 @@ static void kvm_get_preset_lpj(void)
static struct clocksource kvm_clock = {
.name = "kvm-clock",
- .read = kvm_clock_read,
+ .read = kvm_clock_get_cycles,
.rating = 400,
.mask = CLOCKSOURCE_MASK(64),
.mult = 1 << KVM_SCALE,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index a0f3851ef310..98c470c069d1 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -108,40 +108,29 @@ struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
EXPORT_SYMBOL_GPL(ucode_cpu_info);
#ifdef CONFIG_MICROCODE_OLD_INTERFACE
-struct update_for_cpu {
- const void __user *buf;
- size_t size;
-};
-
-static long update_for_cpu(void *_ufc)
-{
- struct update_for_cpu *ufc = _ufc;
- int error;
-
- error = microcode_ops->request_microcode_user(smp_processor_id(),
- ufc->buf, ufc->size);
- if (error < 0)
- return error;
- if (!error)
- microcode_ops->apply_microcode(smp_processor_id());
- return error;
-}
-
static int do_microcode_update(const void __user *buf, size_t size)
{
+ cpumask_t old;
int error = 0;
int cpu;
- struct update_for_cpu ufc = { .buf = buf, .size = size };
+
+ old = current->cpus_allowed;
for_each_online_cpu(cpu) {
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (!uci->valid)
continue;
- error = work_on_cpu(cpu, update_for_cpu, &ufc);
+
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+ error = microcode_ops->request_microcode_user(cpu, buf, size);
if (error < 0)
- break;
+ goto out;
+ if (!error)
+ microcode_ops->apply_microcode(cpu);
}
+out:
+ set_cpus_allowed_ptr(current, &old);
return error;
}
@@ -391,8 +380,6 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
return err;
err = microcode_init_cpu(cpu);
- if (err)
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
return err;
}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 34f12e9996ed..221a3853e268 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -50,7 +50,7 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
}
-struct dma_map_ops swiotlb_dma_ops = {
+static struct dma_map_ops swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
.alloc_coherent = x86_swiotlb_alloc_coherent,
.free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index e95022e4f5d5..7563b31b4f03 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -261,8 +261,6 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
{
if (hpet_force_user)
old_ich_force_enable_hpet(dev);
- else
- hpet_print_force_info();
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index deb5ebb32c3b..ed0c33761e6d 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -25,6 +25,8 @@ static int uv_bau_retry_limit __read_mostly;
/* position of pnode (which is nasid>>1): */
static int uv_nshift __read_mostly;
+/* base pnode in this partition */
+static int uv_partition_base_pnode __read_mostly;
static unsigned long uv_mmask __read_mostly;
@@ -32,6 +34,34 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
static DEFINE_PER_CPU(struct bau_control, bau_control);
/*
+ * Determine the first node on a blade.
+ */
+static int __init blade_to_first_node(int blade)
+{
+ int node, b;
+
+ for_each_online_node(node) {
+ b = uv_node_to_blade_id(node);
+ if (blade == b)
+ return node;
+ }
+ return -1; /* shouldn't happen */
+}
+
+/*
+ * Determine the apicid of the first cpu on a blade.
+ */
+static int __init blade_to_first_apicid(int blade)
+{
+ int cpu;
+
+ for_each_present_cpu(cpu)
+ if (blade == uv_cpu_to_blade_id(cpu))
+ return per_cpu(x86_cpu_to_apicid, cpu);
+ return -1;
+}
+
+/*
* Free a software acknowledge hardware resource by clearing its Pending
* bit. This will return a reply to the sender.
* If the message has timed out, a reply has already been sent by the
@@ -67,7 +97,7 @@ static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
cpu = uv_blade_processor_id();
msg->number_of_cpus =
- uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
+ uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
this_cpu_mask = 1UL << cpu;
if (msp->seen_by.bits & this_cpu_mask)
return;
@@ -215,14 +245,14 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
* Returns @flush_mask if some remote flushing remains to be done. The
* mask will have some bits still set.
*/
-const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
+const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
struct bau_desc *bau_desc,
struct cpumask *flush_mask)
{
int completion_status = 0;
int right_shift;
int tries = 0;
- int blade;
+ int pnode;
int bit;
unsigned long mmr_offset;
unsigned long index;
@@ -265,8 +295,8 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
* use the IPI method of shootdown on them.
*/
for_each_cpu(bit, flush_mask) {
- blade = uv_cpu_to_blade_id(bit);
- if (blade == this_blade)
+ pnode = uv_cpu_to_pnode(bit);
+ if (pnode == this_pnode)
continue;
cpumask_clear_cpu(bit, flush_mask);
}
@@ -309,16 +339,16 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
int i;
int bit;
- int blade;
+ int pnode;
int uv_cpu;
- int this_blade;
+ int this_pnode;
int locals = 0;
struct bau_desc *bau_desc;
cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
uv_cpu = uv_blade_processor_id();
- this_blade = uv_numa_blade_id();
+ this_pnode = uv_hub_info->pnode;
bau_desc = __get_cpu_var(bau_control).descriptor_base;
bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
@@ -326,13 +356,14 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
i = 0;
for_each_cpu(bit, flush_mask) {
- blade = uv_cpu_to_blade_id(bit);
- BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
- if (blade == this_blade) {
+ pnode = uv_cpu_to_pnode(bit);
+ BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
+ if (pnode == this_pnode) {
locals++;
continue;
}
- bau_node_set(blade, &bau_desc->distribution);
+ bau_node_set(pnode - uv_partition_base_pnode,
+ &bau_desc->distribution);
i++;
}
if (i == 0) {
@@ -350,7 +381,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
bau_desc->payload.address = va;
bau_desc->payload.sending_cpu = cpu;
- return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
+ return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
}
/*
@@ -418,24 +449,58 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
set_irq_regs(old_regs);
}
+/*
+ * uv_enable_timeouts
+ *
+ * Each target blade (i.e. blades that have cpu's) needs to have
+ * shootdown message timeouts enabled. The timeout does not cause
+ * an interrupt, but causes an error message to be returned to
+ * the sender.
+ */
static void uv_enable_timeouts(void)
{
- int i;
int blade;
- int last_blade;
+ int nblades;
int pnode;
- int cur_cpu = 0;
- unsigned long apicid;
+ unsigned long mmr_image;
- last_blade = -1;
- for_each_online_node(i) {
- blade = uv_node_to_blade_id(i);
- if (blade == last_blade)
+ nblades = uv_num_possible_blades();
+
+ for (blade = 0; blade < nblades; blade++) {
+ if (!uv_blade_nr_possible_cpus(blade))
continue;
- last_blade = blade;
- apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+
pnode = uv_blade_to_pnode(blade);
- cur_cpu += uv_blade_nr_possible_cpus(i);
+ mmr_image =
+ uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
+ /*
+ * Set the timeout period and then lock it in, in three
+ * steps; captures and locks in the period.
+ *
+ * To program the period, the SOFT_ACK_MODE must be off.
+ */
+ mmr_image &= ~((unsigned long)1 <<
+ UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+ uv_write_global_mmr64
+ (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+ /*
+ * Set the 4-bit period.
+ */
+ mmr_image &= ~((unsigned long)0xf <<
+ UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+ mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
+ UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+ uv_write_global_mmr64
+ (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+ /*
+ * Subsequent reversals of the timebase bit (3) cause an
+ * immediate timeout of one or all INTD resources as
+ * indicated in bits 2:0 (7 causes all of them to timeout).
+ */
+ mmr_image |= ((unsigned long)1 <<
+ UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+ uv_write_global_mmr64
+ (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
}
}
@@ -482,8 +547,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
stat->requestee, stat->onetlb, stat->alltlb,
stat->s_retry, stat->d_retry, stat->ptc_i);
seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
- uv_read_global_mmr64(uv_blade_to_pnode
- (uv_cpu_to_blade_id(cpu)),
+ uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
stat->sflush, stat->dflush,
stat->retriesok, stat->nomsg,
@@ -617,16 +681,18 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
* finish the initialization of the per-blade control structures
*/
static void __init
-uv_table_bases_finish(int blade, int node, int cur_cpu,
+uv_table_bases_finish(int blade,
struct bau_control *bau_tablesp,
struct bau_desc *adp)
{
struct bau_control *bcp;
- int i;
+ int cpu;
- for (i = cur_cpu; i < cur_cpu + uv_blade_nr_possible_cpus(blade); i++) {
- bcp = (struct bau_control *)&per_cpu(bau_control, i);
+ for_each_present_cpu(cpu) {
+ if (blade != uv_cpu_to_blade_id(cpu))
+ continue;
+ bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
bcp->bau_msg_head = bau_tablesp->va_queue_first;
bcp->va_queue_first = bau_tablesp->va_queue_first;
bcp->va_queue_last = bau_tablesp->va_queue_last;
@@ -649,11 +715,10 @@ uv_activation_descriptor_init(int node, int pnode)
struct bau_desc *adp;
struct bau_desc *ad2;
- adp = (struct bau_desc *)
- kmalloc_node(16384, GFP_KERNEL, node);
+ adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
BUG_ON(!adp);
- pa = __pa((unsigned long)adp);
+ pa = uv_gpa(adp); /* need the real nasid*/
n = pa >> uv_nshift;
m = pa & uv_mmask;
@@ -667,8 +732,12 @@ uv_activation_descriptor_init(int node, int pnode)
for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
memset(ad2, 0, sizeof(struct bau_desc));
ad2->header.sw_ack_flag = 1;
- ad2->header.base_dest_nodeid =
- uv_blade_to_pnode(uv_cpu_to_blade_id(0));
+ /*
+ * base_dest_nodeid is the first node in the partition, so
+ * the bit map will indicate partition-relative node numbers.
+ * note that base_dest_nodeid is actually a nasid.
+ */
+ ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
ad2->header.command = UV_NET_ENDPOINT_INTD;
ad2->header.int_both = 1;
/*
@@ -686,6 +755,8 @@ static struct bau_payload_queue_entry * __init
uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
{
struct bau_payload_queue_entry *pqp;
+ unsigned long pa;
+ int pn;
char *cp;
pqp = (struct bau_payload_queue_entry *) kmalloc_node(
@@ -696,10 +767,14 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
cp = (char *)pqp + 31;
pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
bau_tablesp->va_queue_first = pqp;
+ /*
+ * need the pnode of where the memory was really allocated
+ */
+ pa = uv_gpa(pqp);
+ pn = pa >> uv_nshift;
uv_write_global_mmr64(pnode,
UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
- ((unsigned long)pnode <<
- UV_PAYLOADQ_PNODE_SHIFT) |
+ ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
uv_physnodeaddr(pqp));
uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
uv_physnodeaddr(pqp));
@@ -715,8 +790,9 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
/*
* Initialization of each UV blade's structures
*/
-static int __init uv_init_blade(int blade, int node, int cur_cpu)
+static int __init uv_init_blade(int blade)
{
+ int node;
int pnode;
unsigned long pa;
unsigned long apicid;
@@ -724,16 +800,17 @@ static int __init uv_init_blade(int blade, int node, int cur_cpu)
struct bau_payload_queue_entry *pqp;
struct bau_control *bau_tablesp;
+ node = blade_to_first_node(blade);
bau_tablesp = uv_table_bases_init(blade, node);
pnode = uv_blade_to_pnode(blade);
adp = uv_activation_descriptor_init(node, pnode);
pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
- uv_table_bases_finish(blade, node, cur_cpu, bau_tablesp, adp);
+ uv_table_bases_finish(blade, bau_tablesp, adp);
/*
* the below initialization can't be in firmware because the
* messaging IRQ will be determined by the OS
*/
- apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+ apicid = blade_to_first_apicid(blade);
pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
if ((pa & 0xff) != UV_BAU_MESSAGE) {
uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
@@ -748,9 +825,7 @@ static int __init uv_init_blade(int blade, int node, int cur_cpu)
static int __init uv_bau_init(void)
{
int blade;
- int node;
int nblades;
- int last_blade;
int cur_cpu;
if (!is_uv_system())
@@ -763,29 +838,21 @@ static int __init uv_bau_init(void)
uv_bau_retry_limit = 1;
uv_nshift = uv_hub_info->n_val;
uv_mmask = (1UL << uv_hub_info->n_val) - 1;
- nblades = 0;
- last_blade = -1;
- cur_cpu = 0;
- for_each_online_node(node) {
- blade = uv_node_to_blade_id(node);
- if (blade == last_blade)
- continue;
- last_blade = blade;
- nblades++;
- }
+ nblades = uv_num_possible_blades();
+
uv_bau_table_bases = (struct bau_control **)
kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
BUG_ON(!uv_bau_table_bases);
- last_blade = -1;
- for_each_online_node(node) {
- blade = uv_node_to_blade_id(node);
- if (blade == last_blade)
- continue;
- last_blade = blade;
- uv_init_blade(blade, node, cur_cpu);
- cur_cpu += uv_blade_nr_possible_cpus(blade);
- }
+ uv_partition_base_pnode = 0x7fffffff;
+ for (blade = 0; blade < nblades; blade++)
+ if (uv_blade_nr_possible_cpus(blade) &&
+ (uv_blade_to_pnode(blade) < uv_partition_base_pnode))
+ uv_partition_base_pnode = uv_blade_to_pnode(blade);
+ for (blade = 0; blade < nblades; blade++)
+ if (uv_blade_nr_possible_cpus(blade))
+ uv_init_blade(blade);
+
alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
uv_enable_timeouts();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7a567ebe6361..d57de05dc430 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -699,7 +699,7 @@ static struct clocksource clocksource_tsc;
* code, which is necessary to support wrapping clocksources like pm
* timer.
*/
-static cycle_t read_tsc(void)
+static cycle_t read_tsc(struct clocksource *cs)
{
cycle_t ret = (cycle_t)get_cycles();
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
index 67f9b9dbf800..36afb98675a4 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -21,6 +21,7 @@
#include <linux/sysdev.h>
#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
struct kobject *sgi_uv_kobj;
@@ -47,6 +48,9 @@ static int __init sgi_uv_sysfs_init(void)
{
unsigned long ret;
+ if (!is_uv_system())
+ return -ENODEV;
+
if (!sgi_uv_kobj)
sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
if (!sgi_uv_kobj) {
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 2ffb6c53326e..583f11d5c480 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -29,7 +29,7 @@
#define RTC_NAME "sgi_rtc"
-static cycle_t uv_read_rtc(void);
+static cycle_t uv_read_rtc(struct clocksource *cs);
static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
static void uv_rtc_timer_setup(enum clock_event_mode,
struct clock_event_device *);
@@ -123,7 +123,7 @@ static int uv_setup_intr(int cpu, u64 expires)
/* Initialize comparator value */
uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
- return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+ return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode));
}
/*
@@ -256,7 +256,7 @@ static int uv_rtc_unset_timer(int cpu)
spin_lock_irqsave(&head->lock, flags);
- if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+ if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t)
rc = 1;
*t = ULLONG_MAX;
@@ -278,7 +278,7 @@ static int uv_rtc_unset_timer(int cpu)
/*
* Read the RTC.
*/
-static cycle_t uv_read_rtc(void)
+static cycle_t uv_read_rtc(struct clocksource *cs)
{
return (cycle_t)uv_read_local_mmr(UVH_RTC);
}
@@ -291,7 +291,7 @@ static int uv_rtc_next_event(unsigned long delta,
{
int ced_cpu = cpumask_first(ced->cpumask);
- return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+ return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc(NULL));
}
/*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index d303369a7bad..2b3eb82efeeb 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -283,7 +283,7 @@ void __devinit vmi_time_ap_init(void)
/** vmi clocksource */
static struct clocksource clocksource_vmi;
-static cycle_t read_real_cycles(void)
+static cycle_t read_real_cycles(struct clocksource *cs)
{
cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
return max(ret, clocksource_vmi.cycle_last);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 0a5b04aa98f1..c5ee17e8c6d9 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -89,7 +89,7 @@ int save_i387_xstate(void __user *buf)
if (!used_math())
return 0;
- clear_used_math(); /* trigger finit */
+
if (task_thread_info(tsk)->status & TS_USEDFPU) {
/*
* Start with clearing the user buffer. This will present a
@@ -114,6 +114,8 @@ int save_i387_xstate(void __user *buf)
return -1;
}
+ clear_used_math(); /* trigger finit */
+
if (task_thread_info(tsk)->status & TS_XSAVE) {
struct _fpstate __user *fx = buf;
struct _xstate __user *x = buf;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a36f7f7c4c7..b6caf1329b1b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1248,7 +1248,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
sp->gfn = gfn;
sp->role = role;
- sp->global = role.cr4_pge;
+ sp->global = 0;
hlist_add_head(&sp->hash_link, bucket);
if (!direct) {
if (rmap_write_protect(vcpu->kvm, gfn))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8ca100a9ecac..7c1ce5ac6131 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2775,6 +2775,9 @@ out:
void kvm_arch_exit(void)
{
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
}
@@ -4159,6 +4162,11 @@ EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
+ if (vcpu->arch.time_page) {
+ kvm_release_page_dirty(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+
kvm_x86_ops->vcpu_free(vcpu);
}
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index e94a11e42f98..ca7ec44bafc3 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -273,15 +273,15 @@ static void lguest_load_idt(const struct desc_ptr *desc)
* controls the entire thing and the Guest asks it to make changes using the
* LOAD_GDT hypercall.
*
- * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY
- * hypercall and use that repeatedly to load a new IDT. I don't think it
- * really matters, but wouldn't it be nice if they were the same? Wouldn't
- * it be even better if you were the one to send the patch to fix it?
+ * This is the exactly like the IDT code.
*/
static void lguest_load_gdt(const struct desc_ptr *desc)
{
- BUG_ON((desc->size + 1) / 8 != GDT_ENTRIES);
- kvm_hypercall2(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES);
+ unsigned int i;
+ struct desc_struct *gdt = (void *)desc->address;
+
+ for (i = 0; i < (desc->size+1)/8; i++)
+ kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
}
/* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
@@ -291,7 +291,9 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
const void *desc, int type)
{
native_write_gdt_entry(dt, entrynum, desc, type);
- kvm_hypercall2(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES);
+ /* Tell Host about this new entry. */
+ kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum,
+ dt[entrynum].a, dt[entrynum].b);
}
/* OK, I lied. There are three "thread local storage" GDT entries which change
@@ -661,7 +663,7 @@ static unsigned long lguest_tsc_khz(void)
/* If we can't use the TSC, the kernel falls back to our lower-priority
* "lguest_clock", where we read the time value given to us by the Host. */
-static cycle_t lguest_clock_read(void)
+static cycle_t lguest_clock_read(struct clocksource *cs)
{
unsigned long sec, nsec;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 09daebfdb11c..8a450930834f 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -280,15 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
return NULL;
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
- if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
+
+ if (kernel_map_sync_memtype(phys_addr, size, prot_val)) {
free_memtype(phys_addr, phys_addr + size);
free_vm_area(area);
return NULL;
}
- if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
+ if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
free_memtype(phys_addr, phys_addr + size);
- vunmap(area->addr);
+ free_vm_area(area);
return NULL;
}
@@ -374,7 +375,8 @@ static void __iomem *ioremap_default(resource_size_t phys_addr,
* - UC_MINUS for non-WB-able memory with no other conflicting mappings
* - Inherit from confliting mappings otherwise
*/
- err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
+ err = reserve_memtype(phys_addr, phys_addr + size,
+ _PAGE_CACHE_WB, &flags);
if (err < 0)
return NULL;
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 256ce643b0ba..16ccbd77917f 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -87,7 +87,7 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
{
struct kmmio_probe *p;
list_for_each_entry_rcu(p, &kmmio_probes, list) {
- if (addr >= p->addr && addr <= (p->addr + p->len))
+ if (addr >= p->addr && addr < (p->addr + p->len))
return p;
}
return NULL;
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 3daefa04ace5..d2530062fe00 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -257,7 +257,7 @@ void resume_map_numa_kva(pgd_t *pgd_base)
}
#endif
-static unsigned long calculate_numa_remap_pages(void)
+static __init unsigned long calculate_numa_remap_pages(void)
{
int nid;
unsigned long size, reserve_pages = 0;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index d73aaa892371..2d05a12029dc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -188,6 +188,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
int nid;
+ if (!end)
+ return;
+
start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index d71e1b636ce6..797f9f107cb6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -945,71 +945,94 @@ int _set_memory_uc(unsigned long addr, int numpages)
int set_memory_uc(unsigned long addr, int numpages)
{
+ int ret;
+
/*
* for now UC MINUS. see comments in ioremap_nocache()
*/
- if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
- _PAGE_CACHE_UC_MINUS, NULL))
- return -EINVAL;
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_UC_MINUS, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = _set_memory_uc(addr, numpages);
+ if (ret)
+ goto out_free;
- return _set_memory_uc(addr, numpages);
+ return 0;
+
+out_free:
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+ return ret;
}
EXPORT_SYMBOL(set_memory_uc);
int set_memory_array_uc(unsigned long *addr, int addrinarray)
{
- unsigned long start;
- unsigned long end;
- int i;
+ int i, j;
+ int ret;
+
/*
* for now UC MINUS. see comments in ioremap_nocache()
*/
for (i = 0; i < addrinarray; i++) {
- start = __pa(addr[i]);
- for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
- if (end != __pa(addr[i + 1]))
- break;
- i++;
- }
- if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
- goto out;
+ ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
+ _PAGE_CACHE_UC_MINUS, NULL);
+ if (ret)
+ goto out_free;
}
- return change_page_attr_set(addr, addrinarray,
+ ret = change_page_attr_set(addr, addrinarray,
__pgprot(_PAGE_CACHE_UC_MINUS), 1);
-out:
- for (i = 0; i < addrinarray; i++) {
- unsigned long tmp = __pa(addr[i]);
-
- if (tmp == start)
- break;
- for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
- if (end != __pa(addr[i + 1]))
- break;
- i++;
- }
- free_memtype(tmp, end);
- }
- return -EINVAL;
+ if (ret)
+ goto out_free;
+
+ return 0;
+
+out_free:
+ for (j = 0; j < i; j++)
+ free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
+
+ return ret;
}
EXPORT_SYMBOL(set_memory_array_uc);
int _set_memory_wc(unsigned long addr, int numpages)
{
- return change_page_attr_set(&addr, numpages,
+ int ret;
+ ret = change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_CACHE_UC_MINUS), 0);
+
+ if (!ret) {
+ ret = change_page_attr_set(&addr, numpages,
__pgprot(_PAGE_CACHE_WC), 0);
+ }
+ return ret;
}
int set_memory_wc(unsigned long addr, int numpages)
{
+ int ret;
+
if (!pat_enabled)
return set_memory_uc(addr, numpages);
- if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
- _PAGE_CACHE_WC, NULL))
- return -EINVAL;
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_WC, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = _set_memory_wc(addr, numpages);
+ if (ret)
+ goto out_free;
+
+ return 0;
- return _set_memory_wc(addr, numpages);
+out_free:
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+ return ret;
}
EXPORT_SYMBOL(set_memory_wc);
@@ -1021,29 +1044,31 @@ int _set_memory_wb(unsigned long addr, int numpages)
int set_memory_wb(unsigned long addr, int numpages)
{
- free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ int ret;
+
+ ret = _set_memory_wb(addr, numpages);
+ if (ret)
+ return ret;
- return _set_memory_wb(addr, numpages);
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ return 0;
}
EXPORT_SYMBOL(set_memory_wb);
int set_memory_array_wb(unsigned long *addr, int addrinarray)
{
int i;
+ int ret;
- for (i = 0; i < addrinarray; i++) {
- unsigned long start = __pa(addr[i]);
- unsigned long end;
-
- for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
- if (end != __pa(addr[i + 1]))
- break;
- i++;
- }
- free_memtype(start, end);
- }
- return change_page_attr_clear(addr, addrinarray,
+ ret = change_page_attr_clear(addr, addrinarray,
__pgprot(_PAGE_CACHE_MASK), 1);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < addrinarray; i++)
+ free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
+
+ return 0;
}
EXPORT_SYMBOL(set_memory_array_wb);
@@ -1136,6 +1161,8 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
retval = cpa_clear_pages_array(pages, addrinarray,
__pgprot(_PAGE_CACHE_MASK));
+ if (retval)
+ return retval;
for (i = 0; i < addrinarray; i++) {
start = (unsigned long)page_address(pages[i]);
@@ -1143,7 +1170,7 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
free_memtype(start, end);
}
- return retval;
+ return 0;
}
EXPORT_SYMBOL(set_pages_array_wb);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index c009a241d562..e6718bb28065 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -182,10 +182,10 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
u8 mtrr_type;
mtrr_type = mtrr_type_lookup(start, end);
- if (mtrr_type == MTRR_TYPE_UNCACHABLE)
- return _PAGE_CACHE_UC;
- if (mtrr_type == MTRR_TYPE_WRCOMB)
- return _PAGE_CACHE_WC;
+ if (mtrr_type != MTRR_TYPE_WRBACK)
+ return _PAGE_CACHE_UC_MINUS;
+
+ return _PAGE_CACHE_WB;
}
return req_type;
@@ -352,23 +352,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
return 0;
}
- if (req_type == -1) {
- /*
- * Call mtrr_lookup to get the type hint. This is an
- * optimization for /dev/mem mmap'ers into WB memory (BIOS
- * tools and ACPI tools). Use WB request for WB memory and use
- * UC_MINUS otherwise.
- */
- u8 mtrr_type = mtrr_type_lookup(start, end);
-
- if (mtrr_type == MTRR_TYPE_WRBACK)
- actual_type = _PAGE_CACHE_WB;
- else
- actual_type = _PAGE_CACHE_UC_MINUS;
- } else {
- actual_type = pat_x_mtrr_type(start, end,
- req_type & _PAGE_CACHE_MASK);
- }
+ /*
+ * Call mtrr_lookup to get the type hint. This is an
+ * optimization for /dev/mem mmap'ers into WB memory (BIOS
+ * tools and ACPI tools). Use WB request for WB memory and use
+ * UC_MINUS otherwise.
+ */
+ actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);
if (new_type)
*new_type = actual_type;
@@ -546,9 +536,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t *vma_prot)
{
- u64 offset = ((u64) pfn) << PAGE_SHIFT;
- unsigned long flags = -1;
- int retval;
+ unsigned long flags = _PAGE_CACHE_WB;
if (!range_is_allowed(pfn, size))
return 0;
@@ -576,64 +564,11 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
}
#endif
- /*
- * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
- *
- * Without O_SYNC, we want to get
- * - WB for WB-able memory and no other conflicting mappings
- * - UC_MINUS for non-WB-able memory with no other conflicting mappings
- * - Inherit from confliting mappings otherwise
- */
- if (flags != -1) {
- retval = reserve_memtype(offset, offset + size, flags, NULL);
- } else {
- retval = reserve_memtype(offset, offset + size, -1, &flags);
- }
-
- if (retval < 0)
- return 0;
-
- if (((pfn < max_low_pfn_mapped) ||
- (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
- ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
- free_memtype(offset, offset + size);
- printk(KERN_INFO
- "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
- current->comm, current->pid,
- cattr_name(flags),
- offset, (unsigned long long)(offset + size));
- return 0;
- }
-
*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
flags);
return 1;
}
-void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
-{
- unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
- u64 addr = (u64)pfn << PAGE_SHIFT;
- unsigned long flags;
-
- reserve_memtype(addr, addr + size, want_flags, &flags);
- if (flags != want_flags) {
- printk(KERN_INFO
- "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
- current->comm, current->pid,
- cattr_name(want_flags),
- addr, (unsigned long long)(addr + size),
- cattr_name(flags));
- }
-}
-
-void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
-{
- u64 addr = (u64)pfn << PAGE_SHIFT;
-
- free_memtype(addr, addr + size);
-}
-
/*
* Change the memory type for the physial address range in kernel identity
* mapping space if that range is a part of identity map.
@@ -671,8 +606,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
{
int is_ram = 0;
int ret;
- unsigned long flags;
unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
+ unsigned long flags = want_flags;
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
@@ -734,29 +669,28 @@ static void free_pfn_range(u64 paddr, unsigned long size)
*
* If the vma has a linear pfn mapping for the entire range, we get the prot
* from pte and reserve the entire vma range with single reserve_pfn_range call.
- * Otherwise, we reserve the entire vma range, my ging through the PTEs page
- * by page to get physical address and protection.
*/
int track_pfn_vma_copy(struct vm_area_struct *vma)
{
- int retval = 0;
- unsigned long i, j;
resource_size_t paddr;
unsigned long prot;
- unsigned long vma_start = vma->vm_start;
- unsigned long vma_end = vma->vm_end;
- unsigned long vma_size = vma_end - vma_start;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
if (!pat_enabled)
return 0;
+ /*
+ * For now, only handle remap_pfn_range() vmas where
+ * is_linear_pfn_mapping() == TRUE. Handling of
+ * vm_insert_pfn() is TBD.
+ */
if (is_linear_pfn_mapping(vma)) {
/*
* reserve the whole chunk covered by vma. We need the
* starting address and protection from pte.
*/
- if (follow_phys(vma, vma_start, 0, &prot, &paddr)) {
+ if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
WARN_ON_ONCE(1);
return -EINVAL;
}
@@ -764,28 +698,7 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
}
- /* reserve entire vma page by page, using pfn and prot from pte */
- for (i = 0; i < vma_size; i += PAGE_SIZE) {
- if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
- continue;
-
- pgprot = __pgprot(prot);
- retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1);
- if (retval)
- goto cleanup_ret;
- }
return 0;
-
-cleanup_ret:
- /* Reserve error: Cleanup partial reservation and return error */
- for (j = 0; j < i; j += PAGE_SIZE) {
- if (follow_phys(vma, vma_start + j, 0, &prot, &paddr))
- continue;
-
- free_pfn_range(paddr, PAGE_SIZE);
- }
-
- return retval;
}
/*
@@ -795,50 +708,28 @@ cleanup_ret:
* prot is passed in as a parameter for the new mapping. If the vma has a
* linear pfn mapping for the entire range reserve the entire vma range with
* single reserve_pfn_range call.
- * Otherwise, we look t the pfn and size and reserve only the specified range
- * page by page.
- *
- * Note that this function can be called with caller trying to map only a
- * subrange/page inside the vma.
*/
int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long size)
{
- int retval = 0;
- unsigned long i, j;
- resource_size_t base_paddr;
resource_size_t paddr;
- unsigned long vma_start = vma->vm_start;
- unsigned long vma_end = vma->vm_end;
- unsigned long vma_size = vma_end - vma_start;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
if (!pat_enabled)
return 0;
+ /*
+ * For now, only handle remap_pfn_range() vmas where
+ * is_linear_pfn_mapping() == TRUE. Handling of
+ * vm_insert_pfn() is TBD.
+ */
if (is_linear_pfn_mapping(vma)) {
/* reserve the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
return reserve_pfn_range(paddr, vma_size, prot, 0);
}
- /* reserve page by page using pfn and size */
- base_paddr = (resource_size_t)pfn << PAGE_SHIFT;
- for (i = 0; i < size; i += PAGE_SIZE) {
- paddr = base_paddr + i;
- retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0);
- if (retval)
- goto cleanup_ret;
- }
return 0;
-
-cleanup_ret:
- /* Reserve error: Cleanup partial reservation and return error */
- for (j = 0; j < i; j += PAGE_SIZE) {
- paddr = base_paddr + j;
- free_pfn_range(paddr, PAGE_SIZE);
- }
-
- return retval;
}
/*
@@ -849,39 +740,23 @@ cleanup_ret:
void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size)
{
- unsigned long i;
resource_size_t paddr;
- unsigned long prot;
- unsigned long vma_start = vma->vm_start;
- unsigned long vma_end = vma->vm_end;
- unsigned long vma_size = vma_end - vma_start;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
if (!pat_enabled)
return;
+ /*
+ * For now, only handle remap_pfn_range() vmas where
+ * is_linear_pfn_mapping() == TRUE. Handling of
+ * vm_insert_pfn() is TBD.
+ */
if (is_linear_pfn_mapping(vma)) {
/* free the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
free_pfn_range(paddr, vma_size);
return;
}
-
- if (size != 0 && size != vma_size) {
- /* free page by page, using pfn and size */
- paddr = (resource_size_t)pfn << PAGE_SHIFT;
- for (i = 0; i < size; i += PAGE_SIZE) {
- paddr = paddr + i;
- free_pfn_range(paddr, PAGE_SIZE);
- }
- } else {
- /* free entire vma, page by page, using the pfn from pte */
- for (i = 0; i < vma_size; i += PAGE_SIZE) {
- if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
- continue;
-
- free_pfn_range(paddr, PAGE_SIZE);
- }
- }
}
pgprot_t pgprot_writecombine(pgprot_t prot)
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index c7d272b8574c..33c5fa57e43d 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -28,6 +28,7 @@ int acpi_numa __initdata;
static struct acpi_table_slit *acpi_slit;
static nodemask_t nodes_parsed __initdata;
+static nodemask_t cpu_nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES];
static int found_add_area __initdata;
@@ -141,6 +142,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
apic_id = pa->apic_id;
apicid_to_node[apic_id] = node;
+ node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
pxm, apic_id, node);
@@ -174,6 +176,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
else
apic_id = pa->apic_id;
apicid_to_node[apic_id] = node;
+ node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
pxm, apic_id, node);
@@ -402,7 +405,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
return -1;
}
- node_possible_map = nodes_parsed;
+ /* Account for nodes with cpus and no memory */
+ nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
/* Finally register nodes */
for_each_node_mask(i, node_possible_map)
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 9bb09823b362..f893d6a6e803 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -94,12 +94,16 @@ struct pci_root_info {
static int pci_root_num;
static struct pci_root_info pci_root_info[PCI_ROOT_NR];
-void set_pci_bus_resources_arch_default(struct pci_bus *b)
+void x86_pci_root_bus_res_quirks(struct pci_bus *b)
{
int i;
int j;
struct pci_root_info *info;
+ /* don't go for it if _CRS is used */
+ if (pci_probe & PCI_USE__CRS)
+ return;
+
/* if only one root bus, don't need to anything */
if (pci_root_num < 2)
return;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 8c362b96b644..2202b6257b82 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -147,10 +147,13 @@ static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
* are examined.
*/
-void __devinit pcibios_fixup_bus(struct pci_bus *b)
+void __devinit pcibios_fixup_bus(struct pci_bus *b)
{
struct pci_dev *dev;
+ /* root bus? */
+ if (!b->parent)
+ x86_pci_root_bus_res_quirks(b);
pci_read_bridge_bases(b);
list_for_each_entry(dev, &b->devices, bus_list)
pcibios_fixup_device_resources(dev);
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index f1817f71e009..a85bef20a3b9 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -238,6 +238,10 @@ void __init pcibios_resource_survey(void)
*/
fs_initcall(pcibios_assign_resources);
+void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b)
+{
+}
+
/*
* If we set up a device for bus mastering, we need to check the latency
* timer as certain crappy BIOSes forget to set it properly.
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 905bb526b133..5fa10bb9604f 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -375,7 +375,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
if (!fixmem32)
return AE_OK;
if ((mcfg_res->start >= fixmem32->address) &&
- (mcfg_res->end < (fixmem32->address +
+ (mcfg_res->end <= (fixmem32->address +
fixmem32->address_length))) {
mcfg_res->flags = 1;
return AE_CTRL_TERMINATE;
@@ -392,7 +392,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
return AE_OK;
if ((mcfg_res->start >= address.minimum) &&
- (mcfg_res->end < (address.minimum + address.address_length))) {
+ (mcfg_res->end <= (address.minimum + address.address_length))) {
mcfg_res->flags = 1;
return AE_CTRL_TERMINATE;
}
@@ -439,7 +439,7 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
u64 old_size = size;
int valid = 0;
- while (!is_reserved(addr, addr + size - 1, E820_RESERVED)) {
+ while (!is_reserved(addr, addr + size, E820_RESERVED)) {
size >>= 1;
if (size < (16UL<<20))
break;
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index d9d35824c56f..6a40b78b46aa 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -104,11 +104,13 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
long ret;
if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
- BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
- offsetof(struct timespec, tv_nsec) ||
- sizeof(*tv) != sizeof(struct timespec));
- do_realtime((struct timespec *)tv);
- tv->tv_usec /= 1000;
+ if (likely(tv != NULL)) {
+ BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
+ offsetof(struct timespec, tv_nsec) ||
+ sizeof(*tv) != sizeof(struct timespec));
+ do_realtime((struct timespec *)tv);
+ tv->tv_usec /= 1000;
+ }
if (unlikely(tz != NULL)) {
/* Avoid memcpy. Some old compilers fail to inline it */
tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 82cd39a6cbd3..f09e8c36ee80 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -42,6 +42,7 @@
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
+#include <asm/proto.h>
#include <asm/msr-index.h>
#include <asm/setup.h>
#include <asm/desc.h>
@@ -168,21 +169,23 @@ static void __init xen_banner(void)
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}
+static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
+static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
+
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
+ unsigned maskecx = ~0;
unsigned maskedx = ~0;
/*
* Mask out inconvenient features, to try and disable as many
* unsupported kernel subsystems as possible.
*/
- if (*ax == 1)
- maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
- (1 << X86_FEATURE_ACPI) | /* disable ACPI */
- (1 << X86_FEATURE_MCE) | /* disable MCE */
- (1 << X86_FEATURE_MCA) | /* disable MCA */
- (1 << X86_FEATURE_ACC)); /* thermal monitoring */
+ if (*ax == 1) {
+ maskecx = cpuid_leaf1_ecx_mask;
+ maskedx = cpuid_leaf1_edx_mask;
+ }
asm(XEN_EMULATE_PREFIX "cpuid"
: "=a" (*ax),
@@ -190,9 +193,43 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
"=c" (*cx),
"=d" (*dx)
: "0" (*ax), "2" (*cx));
+
+ *cx &= maskecx;
*dx &= maskedx;
}
+static __init void xen_init_cpuid_mask(void)
+{
+ unsigned int ax, bx, cx, dx;
+
+ cpuid_leaf1_edx_mask =
+ ~((1 << X86_FEATURE_MCE) | /* disable MCE */
+ (1 << X86_FEATURE_MCA) | /* disable MCA */
+ (1 << X86_FEATURE_ACC)); /* thermal monitoring */
+
+ if (!xen_initial_domain())
+ cpuid_leaf1_edx_mask &=
+ ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
+ (1 << X86_FEATURE_ACPI)); /* disable ACPI */
+
+ ax = 1;
+ xen_cpuid(&ax, &bx, &cx, &dx);
+
+ /* cpuid claims we support xsave; try enabling it to see what happens */
+ if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
+ unsigned long cr4;
+
+ set_in_cr4(X86_CR4_OSXSAVE);
+
+ cr4 = read_cr4();
+
+ if ((cr4 & X86_CR4_OSXSAVE) == 0)
+ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
+
+ clear_in_cr4(X86_CR4_OSXSAVE);
+ }
+}
+
static void xen_set_debugreg(int reg, unsigned long val)
{
HYPERVISOR_set_debugreg(reg, val);
@@ -284,12 +321,11 @@ static void xen_set_ldt(const void *addr, unsigned entries)
static void xen_load_gdt(const struct desc_ptr *dtr)
{
- unsigned long *frames;
unsigned long va = dtr->address;
unsigned int size = dtr->size + 1;
unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ unsigned long frames[pages];
int f;
- struct multicall_space mcs;
/* A GDT can be up to 64k in size, which corresponds to 8192
8-byte entries, or 16 4k pages.. */
@@ -297,19 +333,26 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
BUG_ON(size > 65536);
BUG_ON(va & ~PAGE_MASK);
- mcs = xen_mc_entry(sizeof(*frames) * pages);
- frames = mcs.args;
-
for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
- frames[f] = arbitrary_virt_to_mfn((void *)va);
+ int level;
+ pte_t *ptep = lookup_address(va, &level);
+ unsigned long pfn, mfn;
+ void *virt;
+
+ BUG_ON(ptep == NULL);
+
+ pfn = pte_pfn(*ptep);
+ mfn = pfn_to_mfn(pfn);
+ virt = __va(PFN_PHYS(pfn));
+
+ frames[f] = mfn;
make_lowmem_page_readonly((void *)va);
- make_lowmem_page_readonly(mfn_to_virt(frames[f]));
+ make_lowmem_page_readonly(virt);
}
- MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
-
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+ BUG();
}
static void load_TLS_descriptor(struct thread_struct *t,
@@ -385,7 +428,7 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
{
- if (val->type != 0xf && val->type != 0xe)
+ if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
return 0;
info->vector = vector;
@@ -393,8 +436,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
info->cs = gate_segment(*val);
info->flags = val->dpl;
/* interrupt gates clear IF */
- if (val->type == 0xe)
- info->flags |= 4;
+ if (val->type == GATE_INTERRUPT)
+ info->flags |= 1 << 2;
return 1;
}
@@ -872,7 +915,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
.emergency_restart = xen_emergency_restart,
};
-
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
@@ -897,6 +939,8 @@ asmlinkage void __init xen_start_kernel(void)
xen_init_irq_ops();
+ xen_init_cpuid_mask();
+
#ifdef CONFIG_X86_LOCAL_APIC
/*
* set up the basic apic ops.
@@ -938,6 +982,11 @@ asmlinkage void __init xen_start_kernel(void)
if (!xen_initial_domain())
__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
+#ifdef CONFIG_X86_64
+ /* Work out if we support NX */
+ check_efer();
+#endif
+
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 2a81838a9ab7..9842b1212407 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -184,7 +184,7 @@ static inline unsigned p2m_index(unsigned long pfn)
}
/* Build the parallel p2m_top_mfn structures */
-void xen_setup_mfn_list_list(void)
+static void __init xen_build_mfn_list_list(void)
{
unsigned pfn, idx;
@@ -198,7 +198,10 @@ void xen_setup_mfn_list_list(void)
unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
}
+}
+void xen_setup_mfn_list_list(void)
+{
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
@@ -218,6 +221,8 @@ void __init xen_build_dynamic_phys_to_machine(void)
p2m_top[topidx] = &mfn_list[pfn];
}
+
+ xen_build_mfn_list_list();
}
unsigned long get_phys_to_machine(unsigned long pfn)
@@ -233,47 +238,74 @@ unsigned long get_phys_to_machine(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
-static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
+/* install a new p2m_top page */
+bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
{
- unsigned long *p;
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned long **pfnp, *mfnp;
unsigned i;
- p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
- BUG_ON(p == NULL);
+ pfnp = &p2m_top[topidx];
+ mfnp = &p2m_top_mfn[topidx];
for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
p[i] = INVALID_P2M_ENTRY;
- if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
- free_page((unsigned long)p);
- else
+ if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
*mfnp = virt_to_mfn(p);
+ return true;
+ }
+
+ return false;
}
-void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+static void alloc_p2m(unsigned long pfn)
{
- unsigned topidx, idx;
+ unsigned long *p;
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
- BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
- return;
- }
+ p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
+ BUG_ON(p == NULL);
+
+ if (!install_p2mtop_page(pfn, p))
+ free_page((unsigned long)p);
+}
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ unsigned topidx, idx;
if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
- return;
+ return true;
}
topidx = p2m_top_index(pfn);
if (p2m_top[topidx] == p2m_missing) {
- /* no need to allocate a page to store an invalid entry */
if (mfn == INVALID_P2M_ENTRY)
- return;
- alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
+ return true;
+ return false;
}
idx = p2m_index(pfn);
p2m_top[topidx][idx] = mfn;
+
+ return true;
+}
+
+void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ return;
+ }
+
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ alloc_p2m(pfn);
+
+ if (!__set_phys_to_machine(pfn, mfn))
+ BUG();
+ }
}
unsigned long arbitrary_virt_to_mfn(void *vaddr)
@@ -987,7 +1019,7 @@ static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
return 0;
}
-void __init xen_mark_init_mm_pinned(void)
+static void __init xen_mark_init_mm_pinned(void)
{
xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}
@@ -1270,8 +1302,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
} *args;
struct multicall_space mcs;
- BUG_ON(cpumask_empty(cpus));
- BUG_ON(!mm);
+ if (cpumask_empty(cpus))
+ return; /* nothing to do */
mcs = xen_mc_entry(sizeof(*args));
args = mcs.args;
@@ -1438,6 +1470,15 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
}
#endif
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct mmuext_op op;
+ op.cmd = cmd;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1446,22 +1487,29 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
BUG_ON(mem_map); /* should only be used early */
#endif
make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+}
+
+/* Used for pmd and pud */
+static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+ BUG_ON(mem_map); /* should only be used early */
+#endif
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}
/* Early release_pte assumes that all pts are pinned, since there's
only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
+static __init void xen_release_pte_init(unsigned long pfn)
{
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+static __init void xen_release_pmd_init(unsigned long pfn)
{
- struct mmuext_op op;
- op.cmd = cmd;
- op.arg1.mfn = pfn_to_mfn(pfn);
- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
- BUG();
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
/* This needs to make sure the new pte page is pinned iff its being
@@ -1773,6 +1821,9 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#ifdef CONFIG_X86_LOCAL_APIC
case FIX_APIC_BASE: /* maps dummy local APIC */
#endif
+ case FIX_TEXT_POKE0:
+ case FIX_TEXT_POKE1:
+ /* All local page mappings */
pte = pfn_pte(phys, prot);
break;
@@ -1819,7 +1870,6 @@ __init void xen_post_allocator_init(void)
xen_mark_init_mm_pinned();
}
-
const struct pv_mmu_ops xen_mmu_ops __initdata = {
.pagetable_setup_start = xen_pagetable_setup_start,
.pagetable_setup_done = xen_pagetable_setup_done,
@@ -1843,9 +1893,9 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
- .alloc_pmd = xen_alloc_pte_init,
+ .alloc_pmd = xen_alloc_pmd_init,
.alloc_pmd_clone = paravirt_nop,
- .release_pmd = xen_release_pte_init,
+ .release_pmd = xen_release_pmd_init,
#ifdef CONFIG_HIGHPTE
.kmap_atomic_pte = xen_kmap_atomic_pte,
@@ -1883,8 +1933,8 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
.set_pgd = xen_set_pgd_hyper,
- .alloc_pud = xen_alloc_pte_init,
- .release_pud = xen_release_pte_init,
+ .alloc_pud = xen_alloc_pmd_init,
+ .release_pud = xen_release_pmd_init,
#endif /* PAGETABLE_LEVELS == 4 */
.activate_mm = xen_activate_mm,
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 24d1b44a337d..da7302624897 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -11,6 +11,9 @@ enum pt_level {
};
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
+
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 585a6e330837..429834ec1687 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -317,7 +317,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
BUG_ON(rc);
while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
- HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
barrier();
}
@@ -422,7 +422,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
/* Make sure other vcpus get a chance to run if they need to. */
for_each_cpu(cpu, mask) {
if (xen_vcpu_stolen(cpu)) {
- HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
break;
}
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 14f240623497..0a5aa44299a5 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -213,6 +213,11 @@ cycle_t xen_clocksource_read(void)
return ret;
}
+static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
+{
+ return xen_clocksource_read();
+}
+
static void xen_read_wallclock(struct timespec *ts)
{
struct shared_info *s = HYPERVISOR_shared_info;
@@ -241,7 +246,7 @@ int xen_set_wallclock(unsigned long now)
static struct clocksource xen_clocksource __read_mostly = {
.name = "xen",
.rating = 400,
- .read = xen_clocksource_read,
+ .read = xen_clocksource_get_cycles,
.mask = ~0,
.mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
.shift = XEN_SHIFT,
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 2f5ef2632ea2..20139464943c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -57,8 +57,6 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
bool xen_vcpu_stolen(int vcpu);
-void xen_mark_init_mm_pinned(void);
-
void xen_setup_vcpu_info_placement(void);
#ifdef CONFIG_SMP