Diffstat (limited to 'include/linux/mm.h')
-rw-r--r--  include/linux/mm.h  239
1 file changed, 193 insertions, 46 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c54fb96cb1e6..937bf719c329 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -27,6 +27,7 @@
#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
+#include <linux/sched.h>
struct mempolicy;
struct anon_vma;
@@ -356,10 +357,12 @@ extern unsigned int kobjsize(const void *objp);
/*
* Special vmas that are non-mergable, non-mlock()able.
- * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
*/
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
+/* This mask prevents a VMA from being scanned by khugepaged */
+#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
+
/* This mask defines which mm->def_flags a process can inherit from its parent */
#define VM_INIT_DEF_MASK VM_NOHUGEPAGE
@@ -378,15 +381,75 @@ extern unsigned int kobjsize(const void *objp);
*/
extern pgprot_t protection_map[16];
-#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
-#define FAULT_FLAG_MKWRITE 0x02 /* Fault was mkwrite of existing pte */
-#define FAULT_FLAG_ALLOW_RETRY 0x04 /* Retry fault if blocking */
-#define FAULT_FLAG_RETRY_NOWAIT 0x08 /* Don't drop mmap_sem and wait when retrying */
-#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */
-#define FAULT_FLAG_TRIED 0x20 /* Second try */
-#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */
-#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */
-#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */
+/**
+ * Fault flag definitions.
+ *
+ * @FAULT_FLAG_WRITE: Fault was a write fault.
+ * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
+ * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
+ * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_sem and wait when retrying.
+ * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
+ * @FAULT_FLAG_TRIED: The fault has been tried once.
+ * @FAULT_FLAG_USER: The fault originated in userspace.
+ * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
+ * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
+ * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
+ *
+ * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can control
+ * whether a page fault is allowed to retry by setting these two
+ * fault flags correctly. Currently there are three legal combinations:
+ *
+ * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and
+ * this is the first try
+ *
+ * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and
+ * we've already tried at least once
+ *
+ * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
+ *
+ * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
+ * be used. Note that a page fault can be allowed to retry multiple times,
+ * in which case we'll have an initial fault with flags (a), followed by
+ * continuous faults with flags (b). We should always try to detect pending
+ * signals before a retry to make sure the continuous page faults can still
+ * be interrupted if necessary.
+ */
+#define FAULT_FLAG_WRITE 0x01
+#define FAULT_FLAG_MKWRITE 0x02
+#define FAULT_FLAG_ALLOW_RETRY 0x04
+#define FAULT_FLAG_RETRY_NOWAIT 0x08
+#define FAULT_FLAG_KILLABLE 0x10
+#define FAULT_FLAG_TRIED 0x20
+#define FAULT_FLAG_USER 0x40
+#define FAULT_FLAG_REMOTE 0x80
+#define FAULT_FLAG_INSTRUCTION 0x100
+#define FAULT_FLAG_INTERRUPTIBLE 0x200
+
+/*
+ * The default fault flags that should be used by most of the
+ * arch-specific page fault handlers.
+ */
+#define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \
+ FAULT_FLAG_KILLABLE | \
+ FAULT_FLAG_INTERRUPTIBLE)
+
+/**
+ * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
+ *
+ * This is mostly used for places where we want to avoid holding
+ * the mmap_sem for too long while waiting for another condition
+ * to change, in which case we can be polite and release the
+ * mmap_sem in the first round to avoid potential starvation of other
+ * processes that also want the mmap_sem.
+ *
+ * Return: true if the page fault allows retry and this is the first
+ * attempt at handling the fault; false otherwise.
+ */
+static inline bool fault_flag_allow_retry_first(unsigned int flags)
+{
+ return (flags & FAULT_FLAG_ALLOW_RETRY) &&
+ (!(flags & FAULT_FLAG_TRIED));
+}
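
To see how the three legal combinations arise in practice, here is a minimal
sketch of an arch-style fault handler loop. The function name is invented for
illustration, and error paths are elided; FAULT_FLAG_DEFAULT, FAULT_FLAG_TRIED,
handle_mm_fault(), find_vma() and fatal_signal_pending() are real interfaces at
this point in the tree.

/* Sketch only: starts at combination (a), moves to (b) on each retry. */
static void sketch_do_page_fault(struct pt_regs *regs, unsigned long addr)
{
	struct vm_area_struct *vma;
	unsigned int flags = FAULT_FLAG_DEFAULT; /* (a): ALLOW_RETRY && !TRIED */
	vm_fault_t fault;

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, addr);	/* VMA checks elided */
retry:
	fault = handle_mm_fault(vma, addr, flags);
	if (fault & VM_FAULT_RETRY) {
		/*
		 * handle_mm_fault() dropped mmap_sem for us. Per the comment
		 * above, detect pending signals before retrying (a user-mode
		 * fault with FAULT_FLAG_INTERRUPTIBLE would also check
		 * non-fatal signals here).
		 */
		if (fatal_signal_pending(current))
			return;
		flags |= FAULT_FLAG_TRIED;	/* (b): ALLOW_RETRY && TRIED */
		down_read(&current->mm->mmap_sem);
		goto retry;
	}
	up_read(&current->mm->mmap_sem);
}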
#define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
@@ -397,7 +460,8 @@ extern pgprot_t protection_map[16];
{ FAULT_FLAG_TRIED, "TRIED" }, \
{ FAULT_FLAG_USER, "USER" }, \
{ FAULT_FLAG_REMOTE, "REMOTE" }, \
- { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }
+ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \
+ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }
/*
* vm_fault is filled by the pagefault handler and passed to the vma's
@@ -541,6 +605,30 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
return !vma->vm_ops;
}
+static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
+{
+ int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+ if (!maybe_stack)
+ return false;
+
+ if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+ VM_STACK_INCOMPLETE_SETUP)
+ return true;
+
+ return false;
+}
+
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+ if (!current->mm)
+ return true;
+
+ if (current->mm != vma->vm_mm)
+ return true;
+
+ return false;
+}
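
vma_is_foreign() pairs naturally with FAULT_FLAG_REMOTE from the flag table
above: a fault taken against another process's mm is exactly a "foreign" one.
A hedged sketch (the helper name is invented):

/* Illustration only: choose fault flags based on whose mm we operate on. */
static unsigned int sketch_fault_flags(struct vm_area_struct *vma)
{
	unsigned int flags = FAULT_FLAG_DEFAULT;

	if (vma_is_foreign(vma))
		flags |= FAULT_FLAG_REMOTE;	/* not current's task/mm */
	return flags;
}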
#ifdef CONFIG_SHMEM
/*
* The vma_is_shmem is not inline because it is used only by slow
@@ -770,6 +858,24 @@ static inline unsigned int compound_order(struct page *page)
return page[1].compound_order;
}
+static inline bool hpage_pincount_available(struct page *page)
+{
+ /*
+ * Can the page->hpage_pinned_refcount field be used? That field is in
+ * the 3rd page of the compound page, so the smallest (2-page) compound
+ * pages cannot support it.
+ */
+ page = compound_head(page);
+ return PageCompound(page) && compound_order(page) > 1;
+}
+
+static inline int compound_pincount(struct page *page)
+{
+ VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
+ page = compound_head(page);
+ return atomic_read(compound_pincount_ptr(page));
+}
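
The "order > 1" test above reflects where the pin count is stored:
compound_pincount_ptr() comes from the matching mm_types.h change (not shown
in this diff) and is assumed to resolve to a field in the third struct page,
roughly:

/*
 * Assumed layout, for illustration only: the pin count lives in the
 * third struct page of the compound page, so an order-1 (2-page)
 * compound page has nowhere to put it.
 */
static inline atomic_t *sketch_compound_pincount_ptr(struct page *head)
{
	return &head[2].hpage_pinned_refcount;
}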
+
static inline void set_compound_order(struct page *page, unsigned int order)
{
page[1].compound_order = order;
@@ -1001,6 +1107,8 @@ static inline void get_page(struct page *page)
page_ref_inc(page);
}
+bool __must_check try_grab_page(struct page *page, unsigned int flags);
+
static inline __must_check bool try_get_page(struct page *page)
{
page = compound_head(page);
@@ -1029,29 +1137,87 @@ static inline void put_page(struct page *page)
__put_page(page);
}
-/**
- * unpin_user_page() - release a gup-pinned page
- * @page: pointer to page to be released
+/*
+ * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
+ * the page's refcount so that two separate items are tracked: the original page
+ * reference count, and also a new count of how many pin_user_pages() calls were
+ * made against the page. ("gup-pinned" is another term for the latter).
*
- * Pages that were pinned via pin_user_pages*() must be released via either
- * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
- * that eventually such pages can be separately tracked and uniquely handled. In
- * particular, interactions with RDMA and filesystems need special handling.
+ * With this scheme, pin_user_pages() becomes special: such pages are marked as
+ * distinct from normal pages. As such, the unpin_user_page() call (and its
+ * variants) must be used in order to release gup-pinned pages.
*
- * unpin_user_page() and put_page() are not interchangeable, despite this early
- * implementation that makes them look the same. unpin_user_page() calls must
- * be perfectly matched up with pin*() calls.
+ * Choice of value:
+ *
+ * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
+ * counts with respect to pin_user_pages() and unpin_user_page() becomes
+ * simpler, because adding a power of two to the page refcount has the
+ * effect of using only the upper N bits, for the code that
+ * counts up using the bias value. This means that the lower bits are left for
+ * the exclusive use of the original code that increments and decrements by one
+ * (or at least, by much smaller values than the bias value).
+ *
+ * Of course, once the lower bits overflow into the upper bits (and this is
+ * OK, because subtraction recovers the original values), then visual inspection
+ * no longer suffices to directly view the separate counts. However, for normal
+ * applications that don't have huge page reference counts, this won't be an
+ * issue.
+ *
+ * Locking: the lockless algorithm described in page_cache_get_speculative()
+ * and page_cache_gup_pin_speculative() provides safe operation for
+ * get_user_pages and page_mkclean and other calls that race to set up page
+ * table entries.
*/
-static inline void unpin_user_page(struct page *page)
-{
- put_page(page);
-}
+#define GUP_PIN_COUNTING_BIAS (1U << 10)
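
As a worked example of the arithmetic described above (standalone userspace
C, not kernel code): with a bias of 1024, pins land in the upper bits and
ordinary references in the lower bits, and they remain separable by simple
division until the low bits overflow.

#include <stdio.h>

#define GUP_PIN_COUNTING_BIAS (1U << 10)

int main(void)
{
	unsigned int refcount = 1;		/* initial page reference  */

	refcount += 3;				/* three get_page() calls  */
	refcount += 2 * GUP_PIN_COUNTING_BIAS;	/* two pin_user_pages()    */

	printf("refcount     = %u\n", refcount);			 /* 2052 */
	printf("approx. pins = %u\n", refcount / GUP_PIN_COUNTING_BIAS); /* 2 */
	printf("plain refs   = %u\n", refcount % GUP_PIN_COUNTING_BIAS); /* 4 */
	return 0;
}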
+void unpin_user_page(struct page *page);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty);
-
void unpin_user_pages(struct page **pages, unsigned long npages);
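
A hedged sketch of the pairing rule these declarations enforce: the function
and the DMA step are invented, while pin_user_pages() and
unpin_user_pages_dirty_lock() are the real interfaces. A real caller would
also have to cope with a short pin count.

/* Sketch: pinned pages must go back via unpin_*(), never put_page(). */
static int sketch_dma_to_user(unsigned long uaddr, unsigned long npages,
			      struct page **pages)
{
	long pinned = pin_user_pages(uaddr, npages, FOLL_WRITE, pages, NULL);

	if (pinned < 0)
		return pinned;		/* short-pin handling elided */

	/* ... program the DMA engine, wait for completion ... */

	unpin_user_pages_dirty_lock(pages, pinned, true); /* also set dirty */
	return 0;
}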
+/**
+ * page_maybe_dma_pinned() - report if a page is pinned for DMA.
+ *
+ * This function checks if a page has been pinned via a call to
+ * pin_user_pages*().
+ *
+ * For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
+ * because it means "definitely not pinned for DMA", but true means "probably
+ * pinned for DMA, but possibly a false positive due to having at least
+ * GUP_PIN_COUNTING_BIAS worth of normal page references".
+ *
+ * False positives are OK, because: a) it's unlikely for a page to get that many
+ * refcounts, and b) all the callers of this routine are expected to be able to
+ * deal gracefully with a false positive.
+ *
+ * For huge pages, the result will be exactly correct. That's because we have
+ * more tracking data available: the 3rd struct page in the compound page is
+ * used to track the pincount (instead of using the GUP_PIN_COUNTING_BIAS
+ * scheme).
+ *
+ * For more information, please see Documentation/vm/pin_user_pages.rst.
+ *
+ * @page: pointer to page to be queried.
+ * Return: True, if it is likely that the page has been "dma-pinned".
+ * False, if the page is definitely not dma-pinned.
+ */
+static inline bool page_maybe_dma_pinned(struct page *page)
+{
+ if (hpage_pincount_available(page))
+ return compound_pincount(page) > 0;
+
+ /*
+ * page_ref_count() is signed. If that refcount overflows, then
+ * page_ref_count() returns a negative value, and callers will avoid
+ * further incrementing the refcount.
+ *
+ * Here, for that overflow case, use the signed bit to count a little
+ * bit higher via unsigned math, and thus still get an accurate result.
+ */
+ return ((unsigned int)page_ref_count(compound_head(page))) >=
+ GUP_PIN_COUNTING_BIAS;
+}
+
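
The cast in the return statement above is doing real work. A standalone
demonstration (plain C, not kernel code) of why reinterpreting a negative,
overflowed refcount as unsigned still compares correctly against the bias:

#include <stdio.h>

#define GUP_PIN_COUNTING_BIAS (1U << 10)

int main(void)
{
	int refcount = -2147483000;	/* count that overflowed into the sign bit */

	/* A signed compare would wrongly report "not pinned": */
	printf("signed:   %d\n", refcount >= (int)GUP_PIN_COUNTING_BIAS);
	/* The unsigned compare counts "a little bit higher" and still
	 * reports the pin: */
	printf("unsigned: %d\n", (unsigned int)refcount >= GUP_PIN_COUNTING_BIAS);
	return 0;
}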
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
@@ -2364,26 +2530,7 @@ struct vm_unmapped_area_info {
unsigned long align_offset;
};
-extern unsigned long unmapped_area(struct vm_unmapped_area_info *info);
-extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
-
-/*
- * Search for an unmapped address range.
- *
- * We are looking for a range that:
- * - does not intersect with any VMA;
- * - is contained within the [low_limit, high_limit) interval;
- * - is at least the desired size.
- * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
- */
-static inline unsigned long
-vm_unmapped_area(struct vm_unmapped_area_info *info)
-{
- if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
- return unmapped_area_topdown(info);
- else
- return unmapped_area(info);
-}
+extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
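
With the old inline dispatcher above folded away, the topdown/bottomup choice
is made inside vm_unmapped_area() from info->flags alone. A hedged sketch of
a caller (the limits chosen here are illustrative):

/* Sketch: search top-down for a free range below the mmap base. */
static unsigned long sketch_get_area(unsigned long len)
{
	struct vm_unmapped_area_info info = {
		.flags		= VM_UNMAPPED_AREA_TOPDOWN,
		.length		= len,
		.low_limit	= PAGE_SIZE,
		.high_limit	= current->mm->mmap_base,
		.align_mask	= 0,
		.align_offset	= 0,
	};

	return vm_unmapped_area(&info);
}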
/* truncate.c */
extern void truncate_inode_pages(struct address_space *, loff_t);