summaryrefslogtreecommitdiff
path: root/Documentation/virtual
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/virtual')
-rw-r--r--Documentation/virtual/kvm/00-INDEX2
-rw-r--r--Documentation/virtual/kvm/api.txt289
-rw-r--r--Documentation/virtual/kvm/devices/arm-vgic-v3.txt11
-rw-r--r--Documentation/virtual/kvm/devices/arm-vgic.txt6
-rw-r--r--Documentation/virtual/kvm/halt-polling.txt127
-rw-r--r--Documentation/virtual/kvm/hypercalls.txt35
-rw-r--r--Documentation/virtual/kvm/locking.txt39
-rw-r--r--Documentation/virtual/kvm/msr.txt9
-rw-r--r--Documentation/virtual/kvm/review-checklist.txt4
-rw-r--r--Documentation/virtual/uml/UserModeLinux-HOWTO.txt6
10 files changed, 504 insertions, 24 deletions
diff --git a/Documentation/virtual/kvm/00-INDEX b/Documentation/virtual/kvm/00-INDEX
index fee9f2bf9c64..69fe1a8b7ad1 100644
--- a/Documentation/virtual/kvm/00-INDEX
+++ b/Documentation/virtual/kvm/00-INDEX
@@ -6,6 +6,8 @@ cpuid.txt
- KVM-specific cpuid leaves (x86).
devices/
- KVM_CAP_DEVICE_CTRL userspace API.
+halt-polling.txt
+ - notes on halt-polling
hypercalls.txt
- KVM hypercalls.
locking.txt
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6bbceb9a3a19..fd106899afd1 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -951,6 +951,10 @@ This ioctl allows the user to create or modify a guest physical memory
slot. When changing an existing slot, it may be moved in the guest
physical memory space, or its flags may be modified. It may not be
resized. Slots may not overlap in guest physical address space.
+Bits 0-15 of "slot" specifies the slot id and this value should be
+less than the maximum number of user memory slots supported per VM.
+The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS,
+if this capability is supported by the architecture.
If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot"
specifies the address space which is being modified. They must be
@@ -2034,6 +2038,8 @@ registers, find a list below:
PPC | KVM_REG_PPC_WORT | 64
PPC | KVM_REG_PPC_SPRG9 | 64
PPC | KVM_REG_PPC_DBSR | 32
+ PPC | KVM_REG_PPC_TIDR | 64
+ PPC | KVM_REG_PPC_PSSCR | 64
PPC | KVM_REG_PPC_TM_GPR0 | 64
...
PPC | KVM_REG_PPC_TM_GPR31 | 64
@@ -2050,6 +2056,7 @@ registers, find a list below:
PPC | KVM_REG_PPC_TM_VSCR | 32
PPC | KVM_REG_PPC_TM_DSCR | 64
PPC | KVM_REG_PPC_TM_TAR | 64
+ PPC | KVM_REG_PPC_TM_XER | 64
| |
MIPS | KVM_REG_MIPS_R0 | 64
...
@@ -2058,6 +2065,8 @@ registers, find a list below:
MIPS | KVM_REG_MIPS_LO | 64
MIPS | KVM_REG_MIPS_PC | 64
MIPS | KVM_REG_MIPS_CP0_INDEX | 32
+ MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64
+ MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64
MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64
MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64
MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32
@@ -2068,9 +2077,11 @@ registers, find a list below:
MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64
MIPS | KVM_REG_MIPS_CP0_COMPARE | 32
MIPS | KVM_REG_MIPS_CP0_STATUS | 32
+ MIPS | KVM_REG_MIPS_CP0_INTCTL | 32
MIPS | KVM_REG_MIPS_CP0_CAUSE | 32
MIPS | KVM_REG_MIPS_CP0_EPC | 64
MIPS | KVM_REG_MIPS_CP0_PRID | 32
+ MIPS | KVM_REG_MIPS_CP0_EBASE | 64
MIPS | KVM_REG_MIPS_CP0_CONFIG | 32
MIPS | KVM_REG_MIPS_CP0_CONFIG1 | 32
MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32
@@ -2145,6 +2156,12 @@ patterns depending on whether they're 32-bit or 64-bit registers:
0x7020 0000 0001 00 <reg:5> <sel:3> (32-bit)
0x7030 0000 0001 00 <reg:5> <sel:3> (64-bit)
+Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64
+versions of the EntryLo registers regardless of the word size of the host
+hardware, host kernel, guest, and whether XPA is present in the guest, i.e.
+with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and
+the PFNX field starting at bit 30.
+
MIPS KVM control registers (see above) have the following id bit patterns:
0x7030 0000 0002 <reg:16>
@@ -2209,7 +2226,7 @@ after pausing the vcpu, but before it is resumed.
4.71 KVM_SIGNAL_MSI
Capability: KVM_CAP_SIGNAL_MSI
-Architectures: x86 arm64
+Architectures: x86 arm arm64
Type: vm ioctl
Parameters: struct kvm_msi (in)
Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error
@@ -2440,18 +2457,20 @@ are, it will do nothing and return an EBUSY error.
The parameter is a pointer to a 32-bit unsigned integer variable
containing the order (log base 2) of the desired size of the hash
table, which must be between 18 and 46. On successful return from the
-ioctl, it will have been updated with the order of the hash table that
-was allocated.
+ioctl, the value will not be changed by the kernel.
If no hash table has been allocated when any vcpu is asked to run
(with the KVM_RUN ioctl), the host kernel will allocate a
default-sized hash table (16 MB).
If this ioctl is called when a hash table has already been allocated,
-the kernel will clear out the existing hash table (zero all HPTEs) and
-return the hash table order in the parameter. (If the guest is using
-the virtualized real-mode area (VRMA) facility, the kernel will
-re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
+with a different order from the existing hash table, the existing hash
+table will be freed and a new one allocated. If this is ioctl is
+called when a hash table has already been allocated of the same order
+as specified, the kernel will clear out the existing hash table (zero
+all HPTEs). In either case, if the guest is using the virtualized
+real-mode area (VRMA) facility, the kernel will re-create the VMRA
+HPTEs on the next KVM_RUN of any vcpu.
4.77 KVM_S390_INTERRUPT
@@ -3174,7 +3193,7 @@ of IOMMU pages.
The rest of functionality is identical to KVM_CREATE_SPAPR_TCE.
-4.98 KVM_REINJECT_CONTROL
+4.99 KVM_REINJECT_CONTROL
Capability: KVM_CAP_REINJECT_CONTROL
Architectures: x86
@@ -3198,6 +3217,229 @@ struct kvm_reinject_control {
pit_reinject = 0 (!reinject mode) is recommended, unless running an old
operating system that uses the PIT for timing (e.g. Linux 2.4.x).
+4.100 KVM_PPC_CONFIGURE_V3_MMU
+
+Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_ppc_mmuv3_cfg (in)
+Returns: 0 on success,
+ -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read,
+ -EINVAL if the configuration is invalid
+
+This ioctl controls whether the guest will use radix or HPT (hashed
+page table) translation, and sets the pointer to the process table for
+the guest.
+
+struct kvm_ppc_mmuv3_cfg {
+ __u64 flags;
+ __u64 process_table;
+};
+
+There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and
+KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest
+to use radix tree translation, and if clear, to use HPT translation.
+KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest
+to be able to use the global TLB and SLB invalidation instructions;
+if clear, the guest may not use these instructions.
+
+The process_table field specifies the address and size of the guest
+process table, which is in the guest's space. This field is formatted
+as the second doubleword of the partition table entry, as defined in
+the Power ISA V3.00, Book III section 5.7.6.1.
+
+4.101 KVM_PPC_GET_RMMU_INFO
+
+Capability: KVM_CAP_PPC_RADIX_MMU
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_ppc_rmmu_info (out)
+Returns: 0 on success,
+ -EFAULT if struct kvm_ppc_rmmu_info cannot be written,
+ -EINVAL if no useful information can be returned
+
+This ioctl returns a structure containing two things: (a) a list
+containing supported radix tree geometries, and (b) a list that maps
+page sizes to put in the "AP" (actual page size) field for the tlbie
+(TLB invalidate entry) instruction.
+
+struct kvm_ppc_rmmu_info {
+ struct kvm_ppc_radix_geom {
+ __u8 page_shift;
+ __u8 level_bits[4];
+ __u8 pad[3];
+ } geometries[8];
+ __u32 ap_encodings[8];
+};
+
+The geometries[] field gives up to 8 supported geometries for the
+radix page table, in terms of the log base 2 of the smallest page
+size, and the number of bits indexed at each level of the tree, from
+the PTE level up to the PGD level in that order. Any unused entries
+will have 0 in the page_shift field.
+
+The ap_encodings gives the supported page sizes and their AP field
+encodings, encoded with the AP value in the top 3 bits and the log
+base 2 of the page size in the bottom 6 bits.
+
+4.102 KVM_PPC_RESIZE_HPT_PREPARE
+
+Capability: KVM_CAP_SPAPR_RESIZE_HPT
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_ppc_resize_hpt (in)
+Returns: 0 on successful completion,
+ >0 if a new HPT is being prepared, the value is an estimated
+ number of milliseconds until preparation is complete
+ -EFAULT if struct kvm_reinject_control cannot be read,
+ -EINVAL if the supplied shift or flags are invalid
+ -ENOMEM if unable to allocate the new HPT
+ -ENOSPC if there was a hash collision when moving existing
+ HPT entries to the new HPT
+ -EIO on other error conditions
+
+Used to implement the PAPR extension for runtime resizing of a guest's
+Hashed Page Table (HPT). Specifically this starts, stops or monitors
+the preparation of a new potential HPT for the guest, essentially
+implementing the H_RESIZE_HPT_PREPARE hypercall.
+
+If called with shift > 0 when there is no pending HPT for the guest,
+this begins preparation of a new pending HPT of size 2^(shift) bytes.
+It then returns a positive integer with the estimated number of
+milliseconds until preparation is complete.
+
+If called when there is a pending HPT whose size does not match that
+requested in the parameters, discards the existing pending HPT and
+creates a new one as above.
+
+If called when there is a pending HPT of the size requested, will:
+ * If preparation of the pending HPT is already complete, return 0
+ * If preparation of the pending HPT has failed, return an error
+ code, then discard the pending HPT.
+ * If preparation of the pending HPT is still in progress, return an
+ estimated number of milliseconds until preparation is complete.
+
+If called with shift == 0, discards any currently pending HPT and
+returns 0 (i.e. cancels any in-progress preparation).
+
+flags is reserved for future expansion, currently setting any bits in
+flags will result in an -EINVAL.
+
+Normally this will be called repeatedly with the same parameters until
+it returns <= 0. The first call will initiate preparation, subsequent
+ones will monitor preparation until it completes or fails.
+
+struct kvm_ppc_resize_hpt {
+ __u64 flags;
+ __u32 shift;
+ __u32 pad;
+};
+
+4.103 KVM_PPC_RESIZE_HPT_COMMIT
+
+Capability: KVM_CAP_SPAPR_RESIZE_HPT
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_ppc_resize_hpt (in)
+Returns: 0 on successful completion,
+ -EFAULT if struct kvm_reinject_control cannot be read,
+ -EINVAL if the supplied shift or flags are invalid
+ -ENXIO is there is no pending HPT, or the pending HPT doesn't
+ have the requested size
+ -EBUSY if the pending HPT is not fully prepared
+ -ENOSPC if there was a hash collision when moving existing
+ HPT entries to the new HPT
+ -EIO on other error conditions
+
+Used to implement the PAPR extension for runtime resizing of a guest's
+Hashed Page Table (HPT). Specifically this requests that the guest be
+transferred to working with the new HPT, essentially implementing the
+H_RESIZE_HPT_COMMIT hypercall.
+
+This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has
+returned 0 with the same parameters. In other cases
+KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or
+-EBUSY, though others may be possible if the preparation was started,
+but failed).
+
+This will have undefined effects on the guest if it has not already
+placed itself in a quiescent state where no vcpu will make MMU enabled
+memory accesses.
+
+On succsful completion, the pending HPT will become the guest's active
+HPT and the previous HPT will be discarded.
+
+On failure, the guest will still be operating on its previous HPT.
+
+struct kvm_ppc_resize_hpt {
+ __u64 flags;
+ __u32 shift;
+ __u32 pad;
+};
+
+4.104 KVM_X86_GET_MCE_CAP_SUPPORTED
+
+Capability: KVM_CAP_MCE
+Architectures: x86
+Type: system ioctl
+Parameters: u64 mce_cap (out)
+Returns: 0 on success, -1 on error
+
+Returns supported MCE capabilities. The u64 mce_cap parameter
+has the same format as the MSR_IA32_MCG_CAP register. Supported
+capabilities will have the corresponding bits set.
+
+4.105 KVM_X86_SETUP_MCE
+
+Capability: KVM_CAP_MCE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: u64 mcg_cap (in)
+Returns: 0 on success,
+ -EFAULT if u64 mcg_cap cannot be read,
+ -EINVAL if the requested number of banks is invalid,
+ -EINVAL if requested MCE capability is not supported.
+
+Initializes MCE support for use. The u64 mcg_cap parameter
+has the same format as the MSR_IA32_MCG_CAP register and
+specifies which capabilities should be enabled. The maximum
+supported number of error-reporting banks can be retrieved when
+checking for KVM_CAP_MCE. The supported capabilities can be
+retrieved with KVM_X86_GET_MCE_CAP_SUPPORTED.
+
+4.106 KVM_X86_SET_MCE
+
+Capability: KVM_CAP_MCE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_x86_mce (in)
+Returns: 0 on success,
+ -EFAULT if struct kvm_x86_mce cannot be read,
+ -EINVAL if the bank number is invalid,
+ -EINVAL if VAL bit is not set in status field.
+
+Inject a machine check error (MCE) into the guest. The input
+parameter is:
+
+struct kvm_x86_mce {
+ __u64 status;
+ __u64 addr;
+ __u64 misc;
+ __u64 mcg_status;
+ __u8 bank;
+ __u8 pad1[7];
+ __u64 pad2[3];
+};
+
+If the MCE being reported is an uncorrected error, KVM will
+inject it as an MCE exception into the guest. If the guest
+MCG_STATUS register reports that an MCE is in progress, KVM
+causes an KVM_EXIT_SHUTDOWN vmexit.
+
+Otherwise, if the MCE is a corrected error, KVM will just
+store it in the corresponding bank (provided this bank is
+not holding a previously reported uncorrected error).
+
5. The kvm_run structure
------------------------
@@ -3214,7 +3456,18 @@ struct kvm_run {
Request that KVM_RUN return when it becomes possible to inject external
interrupts into the guest. Useful in conjunction with KVM_INTERRUPT.
- __u8 padding1[7];
+ __u8 immediate_exit;
+
+This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN
+exits immediately, returning -EINTR. In the common scenario where a
+signal is used to "kick" a VCPU out of KVM_RUN, this field can be used
+to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability.
+Rather than blocking the signal outside KVM_RUN, userspace can set up
+a signal handler that sets run->immediate_exit to a non-zero value.
+
+This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available.
+
+ __u8 padding1[6];
/* out */
__u32 exit_reason;
@@ -3939,3 +4192,21 @@ In order to use SynIC, it has to be activated by setting this
capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this
will disable the use of APIC hardware virtualization even if supported
by the CPU, as it's incompatible with SynIC auto-EOI behavior.
+
+8.3 KVM_CAP_PPC_RADIX_MMU
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that that the kernel can support guests using the
+radix MMU defined in Power ISA V3.00 (as implemented in the POWER9
+processor).
+
+8.4 KVM_CAP_PPC_HASH_MMU_V3
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that that the kernel can support guests using the
+hashed page table MMU defined in Power ISA V3.00 (as implemented in
+the POWER9 processor), including in-memory segment tables.
diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
index 9348b3caccd7..c1a24612c198 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
@@ -118,7 +118,7 @@ Groups:
-EBUSY: One or more VCPUs are running
- KVM_DEV_ARM_VGIC_CPU_SYSREGS
+ KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS
Attributes:
The attr field of kvm_device_attr encodes two values:
bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 |
@@ -139,13 +139,15 @@ Groups:
All system regs accessed through this API are (rw, 64-bit) and
kvm_device_attr.addr points to a __u64 value.
- KVM_DEV_ARM_VGIC_CPU_SYSREGS accesses the CPU interface registers for the
+ KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS accesses the CPU interface registers for the
CPU specified by the mpidr field.
+ CPU interface registers access is not implemented for AArch32 mode.
+ Error -ENXIO is returned when accessed in AArch32 mode.
Errors:
-ENXIO: Getting or setting this register is not yet supported
-EBUSY: VCPU is running
- -EINVAL: Invalid mpidr supplied
+ -EINVAL: Invalid mpidr or register value supplied
KVM_DEV_ARM_VGIC_GRP_NR_IRQS
@@ -204,3 +206,6 @@ Groups:
architecture defined MPIDR, and the field is encoded as follows:
| 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 |
| Aff3 | Aff2 | Aff1 | Aff0 |
+ Errors:
+ -EINVAL: vINTID is not multiple of 32 or
+ info field is not VGIC_LEVEL_INFO_LINE_LEVEL
diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt
index 76e61c883347..b2f60ca8b60c 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -83,6 +83,12 @@ Groups:
Bits for undefined preemption levels are RAZ/WI.
+ For historical reasons and to provide ABI compatibility with userspace we
+ export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask
+ field in the lower 5 bits of a word, meaning that userspace must always
+ use the lower 5 bits to communicate with the KVM device and must shift the
+ value left by 3 places to obtain the actual priority mask level.
+
Limitations:
- Priorities are not implemented, and registers are RAZ/WI
- Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
diff --git a/Documentation/virtual/kvm/halt-polling.txt b/Documentation/virtual/kvm/halt-polling.txt
new file mode 100644
index 000000000000..4a8418318769
--- /dev/null
+++ b/Documentation/virtual/kvm/halt-polling.txt
@@ -0,0 +1,127 @@
+The KVM halt polling system
+===========================
+
+The KVM halt polling system provides a feature within KVM whereby the latency
+of a guest can, under some circumstances, be reduced by polling in the host
+for some time period after the guest has elected to no longer run by cedeing.
+That is, when a guest vcpu has ceded, or in the case of powerpc when all of the
+vcpus of a single vcore have ceded, the host kernel polls for wakeup conditions
+before giving up the cpu to the scheduler in order to let something else run.
+
+Polling provides a latency advantage in cases where the guest can be run again
+very quickly by at least saving us a trip through the scheduler, normally on
+the order of a few micro-seconds, although performance benefits are workload
+dependant. In the event that no wakeup source arrives during the polling
+interval or some other task on the runqueue is runnable the scheduler is
+invoked. Thus halt polling is especially useful on workloads with very short
+wakeup periods where the time spent halt polling is minimised and the time
+savings of not invoking the scheduler are distinguishable.
+
+The generic halt polling code is implemented in:
+
+ virt/kvm/kvm_main.c: kvm_vcpu_block()
+
+The powerpc kvm-hv specific case is implemented in:
+
+ arch/powerpc/kvm/book3s_hv.c: kvmppc_vcore_blocked()
+
+Halt Polling Interval
+=====================
+
+The maximum time for which to poll before invoking the scheduler, referred to
+as the halt polling interval, is increased and decreased based on the perceived
+effectiveness of the polling in an attempt to limit pointless polling.
+This value is stored in either the vcpu struct:
+
+ kvm_vcpu->halt_poll_ns
+
+or in the case of powerpc kvm-hv, in the vcore struct:
+
+ kvmppc_vcore->halt_poll_ns
+
+Thus this is a per vcpu (or vcore) value.
+
+During polling if a wakeup source is received within the halt polling interval,
+the interval is left unchanged. In the event that a wakeup source isn't
+received during the polling interval (and thus schedule is invoked) there are
+two options, either the polling interval and total block time[0] were less than
+the global max polling interval (see module params below), or the total block
+time was greater than the global max polling interval.
+
+In the event that both the polling interval and total block time were less than
+the global max polling interval then the polling interval can be increased in
+the hope that next time during the longer polling interval the wake up source
+will be received while the host is polling and the latency benefits will be
+received. The polling interval is grown in the function grow_halt_poll_ns() and
+is multiplied by the module parameter halt_poll_ns_grow.
+
+In the event that the total block time was greater than the global max polling
+interval then the host will never poll for long enough (limited by the global
+max) to wakeup during the polling interval so it may as well be shrunk in order
+to avoid pointless polling. The polling interval is shrunk in the function
+shrink_halt_poll_ns() and is divided by the module parameter
+halt_poll_ns_shrink, or set to 0 iff halt_poll_ns_shrink == 0.
+
+It is worth noting that this adjustment process attempts to hone in on some
+steady state polling interval but will only really do a good job for wakeups
+which come at an approximately constant rate, otherwise there will be constant
+adjustment of the polling interval.
+
+[0] total block time: the time between when the halt polling function is
+ invoked and a wakeup source received (irrespective of
+ whether the scheduler is invoked within that function).
+
+Module Parameters
+=================
+
+The kvm module has 3 tuneable module parameters to adjust the global max
+polling interval as well as the rate at which the polling interval is grown and
+shrunk. These variables are defined in include/linux/kvm_host.h and as module
+parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the
+powerpc kvm-hv case.
+
+Module Parameter | Description | Default Value
+--------------------------------------------------------------------------------
+halt_poll_ns | The global max polling interval | KVM_HALT_POLL_NS_DEFAULT
+ | which defines the ceiling value |
+ | of the polling interval for | (per arch value)
+ | each vcpu. |
+--------------------------------------------------------------------------------
+halt_poll_ns_grow | The value by which the halt | 2
+ | polling interval is multiplied |
+ | in the grow_halt_poll_ns() |
+ | function. |
+--------------------------------------------------------------------------------
+halt_poll_ns_shrink | The value by which the halt | 0
+ | polling interval is divided in |
+ | the shrink_halt_poll_ns() |
+ | function. |
+--------------------------------------------------------------------------------
+
+These module parameters can be set from the debugfs files in:
+
+ /sys/module/kvm/parameters/
+
+Note: that these module parameters are system wide values and are not able to
+ be tuned on a per vm basis.
+
+Further Notes
+=============
+
+- Care should be taken when setting the halt_poll_ns module parameter as a
+large value has the potential to drive the cpu usage to 100% on a machine which
+would be almost entirely idle otherwise. This is because even if a guest has
+wakeups during which very little work is done and which are quite far apart, if
+the period is shorter than the global max polling interval (halt_poll_ns) then
+the host will always poll for the entire block time and thus cpu utilisation
+will go to 100%.
+
+- Halt polling essentially presents a trade off between power usage and latency
+and the module parameters should be used to tune the affinity for this. Idle
+cpu time is essentially converted to host kernel time with the aim of decreasing
+latency when entering the guest.
+
+- Halt polling will only be conducted by the host when no other tasks are
+runnable on that cpu, otherwise the polling will cease immediately and
+schedule will be invoked to allow that other task to run. Thus this doesn't
+allow a guest to denial of service the cpu.
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index c8d040e27046..feaaa634f154 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -81,3 +81,38 @@ the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the
same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall,
specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0)
is used in the hypercall for future use.
+
+
+6. KVM_HC_CLOCK_PAIRING
+------------------------
+Architecture: x86
+Status: active
+Purpose: Hypercall used to synchronize host and guest clocks.
+Usage:
+
+a0: guest physical address where host copies
+"struct kvm_clock_offset" structure.
+
+a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0)
+is supported (corresponding to the host's CLOCK_REALTIME clock).
+
+ struct kvm_clock_pairing {
+ __s64 sec;
+ __s64 nsec;
+ __u64 tsc;
+ __u32 flags;
+ __u32 pad[9];
+ };
+
+ Where:
+ * sec: seconds from clock_type clock.
+ * nsec: nanoseconds from clock_type clock.
+ * tsc: guest TSC value used to calculate sec/nsec pair
+ * flags: flags, unused (0) at the moment.
+
+The hypercall lets a guest compute a precise timestamp across
+host and guest. The guest can use the returned TSC value to
+compute the CLOCK_REALTIME for its clock, at the same instant.
+
+Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource,
+or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK.
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index e5dd9f4d6100..1bb8bcaf8497 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -13,8 +13,12 @@ The acquisition orders for mutexes are as follows:
- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
them together is quite rare.
-For spinlocks, kvm_lock is taken outside kvm->mmu_lock. Everything
-else is a leaf: no other lock is taken inside the critical sections.
+On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock.
+
+For spinlocks, kvm_lock is taken outside kvm->mmu_lock.
+
+Everything else is a leaf: no other lock is taken inside the critical
+sections.
2: Exception
------------
@@ -22,9 +26,16 @@ else is a leaf: no other lock is taken inside the critical sections.
Fast page fault:
Fast page fault is the fast path which fixes the guest page fault out of
-the mmu-lock on x86. Currently, the page fault can be fast only if the
-shadow page table is present and it is caused by write-protect, that means
-we just need change the W bit of the spte.
+the mmu-lock on x86. Currently, the page fault can be fast in one of the
+following two cases:
+
+1. Access Tracking: The SPTE is not present, but it is marked for access
+tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to
+restore the saved R/X bits. This is described in more detail later below.
+
+2. Write-Protection: The SPTE is present and the fault is
+caused by write-protect. That means we just need to change the W bit of the
+spte.
What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and
SPTE_MMU_WRITEABLE bit on the spte:
@@ -34,7 +45,8 @@ SPTE_MMU_WRITEABLE bit on the spte:
page write-protection.
On fast page fault path, we will use cmpxchg to atomically set the spte W
-bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, this
+bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or
+restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This
is safe because whenever changing these bits can be detected by cmpxchg.
But we need carefully check these cases:
@@ -138,6 +150,21 @@ Since the spte is "volatile" if it can be updated out of mmu-lock, we always
atomically update the spte, the race caused by fast page fault can be avoided,
See the comments in spte_has_volatile_bits() and mmu_spte_update().
+Lockless Access Tracking:
+
+This is used for Intel CPUs that are using EPT but do not support the EPT A/D
+bits. In this case, when the KVM MMU notifier is called to track accesses to a
+page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present
+by clearing the RWX bits in the PTE and storing the original R & X bits in
+some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the
+PTE (using the ignored bit 62). When the VM tries to access the page later on,
+a fault is generated and the fast page fault mechanism described above is used
+to atomically restore the PTE to a Present state. The W bit is not saved when
+the PTE is marked for access tracking and during restoration to the Present
+state, the W bit is set depending on whether or not it was a write access. If
+it wasn't, then the W bit will remain clear until a write access happens, at
+which time it will be set using the Dirty tracking mechanism described above.
+
3. Reference
------------
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
index 2a71c8f29f68..0a9ea515512a 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -208,7 +208,9 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
__u64 steal;
__u32 version;
__u32 flags;
- __u32 pad[12];
+ __u8 preempted;
+ __u8 u8_pad[3];
+ __u32 pad[11];
}
whose data will be filled in by the hypervisor periodically. Only one
@@ -232,6 +234,11 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
nanoseconds. Time during which the vcpu is idle, will not be
reported as steal time.
+ preempted: indicate the vCPU who owns this struct is running or
+ not. Non-zero values mean the vCPU has been preempted. Zero
+ means the vCPU is not preempted. NOTE, it is always zero if the
+ the hypervisor doesn't support this field.
+
MSR_KVM_EOI_EN: 0x4b564d04
data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0
when disabled. Bit 1 is reserved and must be zero. When PV end of
diff --git a/Documentation/virtual/kvm/review-checklist.txt b/Documentation/virtual/kvm/review-checklist.txt
index a850986ed684..a83b27635fdd 100644
--- a/Documentation/virtual/kvm/review-checklist.txt
+++ b/Documentation/virtual/kvm/review-checklist.txt
@@ -1,8 +1,8 @@
Review checklist for kvm patches
================================
-1. The patch must follow Documentation/CodingStyle and
- Documentation/SubmittingPatches.
+1. The patch must follow Documentation/process/coding-style.rst and
+ Documentation/process/submitting-patches.rst.
2. Patches should be against kvm.git master branch.
diff --git a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt
index f4099ca6b483..87b80f589e1c 100644
--- a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt
+++ b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt
@@ -2401,9 +2401,9 @@
This takes one argument, which is a single letter. It calls the
generic kernel's SysRq driver, which does whatever is called for by
- that argument. See the SysRq documentation in Documentation/sysrq.txt
- in your favorite kernel tree to see what letters are valid and what
- they do.
+ that argument. See the SysRq documentation in
+ Documentation/admin-guide/sysrq.rst in your favorite kernel tree to
+ see what letters are valid and what they do.