From 32d118ad50a5afecb74358bcefc5cb6ea6ccfc2b Mon Sep 17 00:00:00 2001
From: Daniel Verkamp
Date: Thu, 15 Dec 2022 00:12:02 +0000
Subject: selftests/memfd: add tests for F_SEAL_EXEC

Basic tests to ensure that user/group/other execute bits cannot be
changed after applying F_SEAL_EXEC to a memfd.

Link: https://lkml.kernel.org/r/20221215001205.51969-3-jeffxu@google.com
Signed-off-by: Daniel Verkamp
Co-developed-by: Jeff Xu
Signed-off-by: Jeff Xu
Reviewed-by: Kees Cook
Cc: David Herrmann
Cc: Dmitry Torokhov
Cc: Hugh Dickins
Cc: Jann Horn
Cc: Jorge Lucangeli Obes
Cc: kernel test robot
Cc: Shuah Khan
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/memfd/memfd_test.c | 123 ++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 94df2692e6e4..f18a15a1f275 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -28,12 +28,38 @@
 #define MFD_DEF_SIZE 8192
 #define STACK_SIZE 65536
 
+#define F_SEAL_EXEC	0x0020
+
 /*
  * Default is not to test hugetlbfs
  */
 static size_t mfd_def_size = MFD_DEF_SIZE;
 static const char *memfd_str = MEMFD_STR;
 
+static ssize_t fd2name(int fd, char *buf, size_t bufsize)
+{
+	char buf1[PATH_MAX];
+	int size;
+	ssize_t nbytes;
+
+	size = snprintf(buf1, PATH_MAX, "/proc/self/fd/%d", fd);
+	if (size < 0) {
+		printf("snprintf(%d) failed on %m\n", fd);
+		abort();
+	}
+
+	/*
+	 * reserve one byte for string termination.
+	 */
+	nbytes = readlink(buf1, buf, bufsize-1);
+	if (nbytes == -1) {
+		printf("readlink(%s) failed %m\n", buf1);
+		abort();
+	}
+	buf[nbytes] = '\0';
+	return nbytes;
+}
+
 static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
 {
 	int r, fd;
@@ -98,11 +124,14 @@ static unsigned int mfd_assert_get_seals(int fd)
 
 static void mfd_assert_has_seals(int fd, unsigned int seals)
 {
+	char buf[PATH_MAX];
+	int nbytes;
 	unsigned int s;
 
+	fd2name(fd, buf, PATH_MAX);
 	s = mfd_assert_get_seals(fd);
 	if (s != seals) {
-		printf("%u != %u = GET_SEALS(%d)\n", seals, s, fd);
+		printf("%u != %u = GET_SEALS(%s)\n", seals, s, buf);
 		abort();
 	}
 }
@@ -594,6 +623,64 @@ static void mfd_fail_grow_write(int fd)
 	}
 }
 
+static void mfd_assert_mode(int fd, int mode)
+{
+	struct stat st;
+	char buf[PATH_MAX];
+	int nbytes;
+
+	fd2name(fd, buf, PATH_MAX);
+
+	if (fstat(fd, &st) < 0) {
+		printf("fstat(%s) failed: %m\n", buf);
+		abort();
+	}
+
+	if ((st.st_mode & 07777) != mode) {
+		printf("fstat(%s) wrong file mode 0%04o, but expected 0%04o\n",
+		       buf, (int)st.st_mode & 07777, mode);
+		abort();
+	}
+}
+
+static void mfd_assert_chmod(int fd, int mode)
+{
+	char buf[PATH_MAX];
+	int nbytes;
+
+	fd2name(fd, buf, PATH_MAX);
+
+	if (fchmod(fd, mode) < 0) {
+		printf("fchmod(%s, 0%04o) failed: %m\n", buf, mode);
+		abort();
+	}
+
+	mfd_assert_mode(fd, mode);
+}
+
+static void mfd_fail_chmod(int fd, int mode)
+{
+	struct stat st;
+	char buf[PATH_MAX];
+	int nbytes;
+
+	fd2name(fd, buf, PATH_MAX);
+
+	if (fstat(fd, &st) < 0) {
+		printf("fstat(%s) failed: %m\n", buf);
+		abort();
+	}
+
+	if (fchmod(fd, mode) == 0) {
+		printf("fchmod(%s, 0%04o) didn't fail as expected\n",
+		       buf, mode);
+		abort();
+	}
+
+	/* verify that file mode bits did not change */
+	mfd_assert_mode(fd, st.st_mode & 07777);
+}
+
 static int idle_thread_fn(void *arg)
 {
 	sigset_t set;
@@ -880,6 +967,39 @@ static void test_seal_resize(void)
 	close(fd);
 }
 
+/*
+ * Test SEAL_EXEC
+ * Test that chmod() cannot change x bits after sealing
+
*/ +static void test_seal_exec(void) +{ + int fd; + + printf("%s SEAL-EXEC\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_exec", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0777); + + mfd_assert_chmod(fd, 0644); + + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_EXEC); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + + mfd_assert_chmod(fd, 0600); + mfd_fail_chmod(fd, 0777); + mfd_fail_chmod(fd, 0670); + mfd_fail_chmod(fd, 0605); + mfd_fail_chmod(fd, 0700); + mfd_fail_chmod(fd, 0100); + mfd_assert_chmod(fd, 0666); + + close(fd); +} + /* * Test sharing via dup() * Test that seals are shared between dupped FDs and they're all equal. @@ -1059,6 +1179,7 @@ int main(int argc, char **argv) test_seal_shrink(); test_seal_grow(); test_seal_resize(); + test_seal_exec(); test_share_dup("SHARE-DUP", ""); test_share_mmap("SHARE-MMAP", ""); -- cgit v1.2.3 From 11f75a01448f1b7a739e75dbd8f17b844fcfc510 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Thu, 15 Dec 2022 00:12:05 +0000 Subject: selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC Tests to verify MFD_NOEXEC, MFD_EXEC and vm.memfd_noexec sysctl. Link: https://lkml.kernel.org/r/20221215001205.51969-6-jeffxu@google.com Signed-off-by: Jeff Xu Co-developed-by: Daniel Verkamp Signed-off-by: Daniel Verkamp Reviewed-by: Kees Cook Cc: David Herrmann Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: kernel test robot Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/fuse_test.c | 1 + tools/testing/selftests/memfd/memfd_test.c | 228 ++++++++++++++++++++++++++++- 2 files changed, 224 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index be675002f918..93798c8c5d54 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index f18a15a1f275..ae71f15f790d 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -30,6 +30,14 @@ #define F_SEAL_EXEC 0x0020 +#define F_WX_SEALS (F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE | \ + F_SEAL_EXEC) + +#define MFD_NOEXEC_SEAL 0x0008U + /* * Default is not to test hugetlbfs */ @@ -80,6 +88,37 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) return fd; } +static void sysctl_assert_write(const char *val) +{ + int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC); + + if (fd < 0) { + printf("open sysctl failed\n"); + abort(); + } + + if (write(fd, val, strlen(val)) < 0) { + printf("write sysctl failed\n"); + abort(); + } +} + +static void sysctl_fail_write(const char *val) +{ + int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC); + + if (fd < 0) { + printf("open sysctl failed\n"); + abort(); + } + + if (write(fd, val, strlen(val)) >= 0) { + printf("write sysctl %s succeeded, but failure expected\n", + val); + abort(); + } +} + static int mfd_assert_reopen_fd(int fd_in) { int fd; @@ -758,6 +797,9 @@ static void test_create(void) mfd_fail_new("", ~0); mfd_fail_new("", 0x80000000U); + /* verify EXEC and NOEXEC_SEAL can't both be set */ + mfd_fail_new("", MFD_EXEC | MFD_NOEXEC_SEAL); + /* verify MFD_CLOEXEC is allowed */ fd = mfd_assert_new("", 0, MFD_CLOEXEC); 
close(fd); @@ -969,20 +1011,21 @@ static void test_seal_resize(void) /* * Test SEAL_EXEC - * Test that chmod() cannot change x bits after sealing + * Test fd is created with exec and allow sealing. + * chmod() cannot change x bits after sealing. */ -static void test_seal_exec(void) +static void test_exec_seal(void) { int fd; printf("%s SEAL-EXEC\n", memfd_str); + printf("%s Apply SEAL_EXEC\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_exec", mfd_def_size, - MFD_CLOEXEC | MFD_ALLOW_SEALING); + MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_EXEC); mfd_assert_mode(fd, 0777); - mfd_assert_chmod(fd, 0644); mfd_assert_has_seals(fd, 0); @@ -996,10 +1039,181 @@ static void test_seal_exec(void) mfd_fail_chmod(fd, 0700); mfd_fail_chmod(fd, 0100); mfd_assert_chmod(fd, 0666); + mfd_assert_write(fd); + close(fd); + + printf("%s Apply ALL_SEALS\n", memfd_str); + fd = mfd_assert_new("kern_memfd_seal_exec", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_EXEC); + + mfd_assert_mode(fd, 0777); + mfd_assert_chmod(fd, 0700); + + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_EXEC); + mfd_assert_has_seals(fd, F_WX_SEALS); + mfd_fail_chmod(fd, 0711); + mfd_fail_chmod(fd, 0600); + mfd_fail_write(fd); + close(fd); +} + +/* + * Test EXEC_NO_SEAL + * Test fd is created with exec and not allow sealing. + */ +static void test_exec_no_seal(void) +{ + int fd; + + printf("%s EXEC_NO_SEAL\n", memfd_str); + + /* Create with EXEC but without ALLOW_SEALING */ + fd = mfd_assert_new("kern_memfd_exec_no_sealing", + mfd_def_size, + MFD_CLOEXEC | MFD_EXEC); + mfd_assert_mode(fd, 0777); + mfd_assert_has_seals(fd, F_SEAL_SEAL); + mfd_assert_chmod(fd, 0666); close(fd); } +/* + * Test memfd_create with MFD_NOEXEC flag + */ +static void test_noexec_seal(void) +{ + int fd; + + printf("%s NOEXEC_SEAL\n", memfd_str); + + /* Create with NOEXEC and ALLOW_SEALING */ + fd = mfd_assert_new("kern_memfd_noexec", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_NOEXEC_SEAL); + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); + + /* Create with NOEXEC but without ALLOW_SEALING */ + fd = mfd_assert_new("kern_memfd_noexec", + mfd_def_size, + MFD_CLOEXEC | MFD_NOEXEC_SEAL); + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); +} + +static void test_sysctl_child(void) +{ + int fd; + + printf("%s sysctl 0\n", memfd_str); + sysctl_assert_write("0"); + fd = mfd_assert_new("kern_memfd_sysctl_0", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0777); + mfd_assert_has_seals(fd, 0); + mfd_assert_chmod(fd, 0644); + close(fd); + + printf("%s sysctl 1\n", memfd_str); + sysctl_assert_write("1"); + fd = mfd_assert_new("kern_memfd_sysctl_1", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + sysctl_fail_write("0"); + close(fd); + + printf("%s sysctl 2\n", memfd_str); + sysctl_assert_write("2"); + mfd_fail_new("kern_memfd_sysctl_2", + MFD_CLOEXEC | MFD_ALLOW_SEALING); + sysctl_fail_write("0"); + sysctl_fail_write("1"); +} + +static int newpid_thread_fn(void *arg) +{ + test_sysctl_child(); + return 0; +} + +static void test_sysctl_child2(void) +{ + int fd; + + sysctl_fail_write("0"); + fd = mfd_assert_new("kern_memfd_sysctl_1", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); 
+}
+
+static int newpid_thread_fn2(void *arg)
+{
+	test_sysctl_child2();
+	return 0;
+}
+
+static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *))
+{
+	uint8_t *stack;
+	pid_t pid;
+
+	stack = malloc(STACK_SIZE);
+	if (!stack) {
+		printf("malloc(STACK_SIZE) failed: %m\n");
+		abort();
+	}
+
+	pid = clone(fn,
+		    stack + STACK_SIZE,
+		    SIGCHLD | flags,
+		    NULL);
+	if (pid < 0) {
+		printf("clone() failed: %m\n");
+		abort();
+	}
+
+	return pid;
+}
+
+static void join_newpid_thread(pid_t pid)
+{
+	waitpid(pid, NULL, 0);
+}
+
+/*
+ * Test sysctl
+ * A very basic sealing test to see whether setting/retrieving seals works.
+ */
+static void test_sysctl(void)
+{
+	int pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn);
+
+	join_newpid_thread(pid);
+
+	printf("%s child ns\n", memfd_str);
+	sysctl_assert_write("1");
+
+	pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn2);
+	join_newpid_thread(pid);
+}
+
 /*
  * Test sharing via dup()
  * Test that seals are shared between dupped FDs and they're all equal.
@@ -1173,13 +1387,15 @@ int main(int argc, char **argv)
 
 	test_create();
 	test_basic();
+	test_exec_seal();
+	test_exec_no_seal();
+	test_noexec_seal();
 
 	test_seal_write();
 	test_seal_future_write();
 	test_seal_shrink();
 	test_seal_grow();
 	test_seal_resize();
-	test_seal_exec();
 
 	test_share_dup("SHARE-DUP", "");
 	test_share_mmap("SHARE-MMAP", "");
@@ -1195,6 +1411,8 @@ int main(int argc, char **argv)
 	test_share_fork("SHARE-FORK", SHARED_FT_STR);
 	join_idle_thread(pid);
 
+	test_sysctl();
+
 	printf("memfd: DONE\n");
 
 	return 0;
--
cgit v1.2.3


From 553b014244298d9f807286d6a71d722bc1f50f84 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 5 Dec 2022 23:08:28 +0000
Subject: selftests/damon/sysfs: test filters directory

Add simple test cases for scheme filters of the DAMON sysfs interface.
The test cases check if the files are populated as expected, receive
some valid inputs, and refuse some invalid inputs.
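As a rough sketch of what these tests drive, the same files can be written
from a C program as well; the sysfs layout below (kdamond/context/scheme
index 0 under /sys/kernel/mm/damon/admin/) is an assumption for
illustration and is not part of this patch:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Assumed sysfs location of the first scheme's filters directory. */
    #define FILTERS_DIR \
        "/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/filters"

    static int write_file(const char *path, const char *val)
    {
        int fd = open(path, O_WRONLY);

        if (fd < 0)
            return -1;
        if (write(fd, val, strlen(val)) < 0) {
            close(fd);
            return -1;
        }
        return close(fd);
    }

    int main(void)
    {
        /* Ask for one filter, then configure it to match anonymous pages. */
        if (write_file(FILTERS_DIR "/nr_filters", "1") ||
            write_file(FILTERS_DIR "/0/type", "anon"))
            perror("writing DAMON filter files");
        return 0;
    }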
Link: https://lkml.kernel.org/r/20221205230830.144349-10-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Jonathan Corbet
Cc: Shuah Khan
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/damon/sysfs.sh | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh
index db4942383a50..a00336ffdcad 100644
--- a/tools/testing/selftests/damon/sysfs.sh
+++ b/tools/testing/selftests/damon/sysfs.sh
@@ -96,6 +96,34 @@ test_stats()
 	done
 }
 
+test_filter()
+{
+	filter_dir=$1
+	ensure_file "$filter_dir/type" "exist" "600"
+	ensure_write_succ "$filter_dir/type" "anon" "valid input"
+	ensure_write_succ "$filter_dir/type" "memcg" "valid input"
+	ensure_write_fail "$filter_dir/type" "foo" "invalid input"
+	ensure_file "$filter_dir/matching" "exist" "600"
+	ensure_file "$filter_dir/memcg_path" "exist" "600"
+}
+
+test_filters()
+{
+	filters_dir=$1
+	ensure_dir "$filters_dir" "exist"
+	ensure_file "$filters_dir/nr_filters" "exist" "600"
+	ensure_write_succ "$filters_dir/nr_filters" "1" "valid input"
+	test_filter "$filters_dir/0"
+
+	ensure_write_succ "$filters_dir/nr_filters" "2" "valid input"
+	test_filter "$filters_dir/0"
+	test_filter "$filters_dir/1"
+
+	ensure_write_succ "$filters_dir/nr_filters" "0" "valid input"
+	ensure_dir "$filters_dir/0" "not_exist"
+	ensure_dir "$filters_dir/1" "not_exist"
+}
+
 test_watermarks()
 {
 	watermarks_dir=$1
@@ -143,6 +171,7 @@ test_scheme()
 	test_access_pattern "$scheme_dir/access_pattern"
 	test_quotas "$scheme_dir/quotas"
 	test_watermarks "$scheme_dir/watermarks"
+	test_filters "$scheme_dir/filters"
 	test_stats "$scheme_dir/stats"
 	test_tried_regions "$scheme_dir/tried_regions"
 }
--
cgit v1.2.3


From ef1faf0e370a8e33fe625088ddc5fde02cf8c4c4 Mon Sep 17 00:00:00 2001
From: Jianlin Lv
Date: Mon, 19 Dec 2022 16:49:17 +0000
Subject: tools/vm/page_owner_sort: free memory before exit

Although the kernel removes all memory associated with a process when
that process terminates, it is neither good style nor proper design to
leave the cleanup to the kernel.  This patch frees the allocated memory
before the process exits.
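The cleanup itself follows the usual goto-unwind idiom; a minimal,
self-contained sketch of the pattern (illustrative only, not the tool's
actual code):

    #include <stdlib.h>

    /*
     * Sketch of the goto-unwind pattern the patch adopts: resources are
     * released in reverse order of acquisition on every exit path,
     * instead of calling exit() and leaving the cleanup to the kernel.
     */
    int main(void)
    {
        char *buf, *ext_buf;
        int ret = 1;

        buf = malloc(1024);
        if (!buf)
            goto out;
        ext_buf = malloc(1024);
        if (!ext_buf)
            goto out_buf;

        /* ... the real work would happen here ... */
        ret = 0;

        free(ext_buf);
    out_buf:
        free(buf);
    out:
        return ret;
    }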
Link: https://lkml.kernel.org/r/20221219164917.14132-1-iecedge@gmail.com Signed-off-by: Jianlin Lv Signed-off-by: Andrew Morton --- tools/vm/page_owner_sort.c | 65 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index ce860ab94162..7c2ac124cdc8 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -246,15 +246,16 @@ static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) return 0; } -static void check_regcomp(regex_t *pattern, const char *regex) +static bool check_regcomp(regex_t *pattern, const char *regex) { int err; err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); if (err != 0 || pattern->re_nsub != 1) { fprintf(stderr, "Invalid pattern %s code %d\n", regex, err); - exit(1); + return false; } + return true; } static char **explode(char sep, const char *str, int *size) @@ -494,28 +495,28 @@ static bool is_need(char *buf) return true; } -static void add_list(char *buf, int len, char *ext_buf) +static bool add_list(char *buf, int len, char *ext_buf) { if (list_size != 0 && len == list[list_size-1].len && memcmp(buf, list[list_size-1].txt, len) == 0) { list[list_size-1].num++; list[list_size-1].page_num += get_page_num(buf); - return; + return true; } if (list_size == max_size) { fprintf(stderr, "max_size too small??\n"); - exit(1); + return false; } if (!is_need(buf)) - return; + return true; list[list_size].pid = get_pid(buf); list[list_size].tgid = get_tgid(buf); list[list_size].comm = get_comm(buf); list[list_size].txt = malloc(len+1); if (!list[list_size].txt) { fprintf(stderr, "Out of memory\n"); - exit(1); + return false; } memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; @@ -534,6 +535,7 @@ static void add_list(char *buf, int len, char *ext_buf) printf("loaded %d\r", list_size); fflush(stdout); } + return true; } static bool parse_cull_args(const char *arg_str) @@ -790,12 +792,19 @@ int main(int argc, char **argv) exit(1); } - check_regcomp(&order_pattern, "order\\s*([0-9]*),"); - check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"); - check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "); - check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"); - check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"); - check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"); + if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),")) + goto out_order; + if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),")) + goto out_pid; + if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) ")) + goto out_tgid; + if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts")) + goto out_comm; + if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,")) + goto out_ts; + if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns")) + goto out_free_ts; + fstat(fileno(fin), &st); max_size = st.st_size / 100; /* hack ... 
*/ @@ -804,7 +813,7 @@ int main(int argc, char **argv) ext_buf = malloc(BUF_SIZE); if (!list || !buf || !ext_buf) { fprintf(stderr, "Out of memory\n"); - exit(1); + goto out_free; } for ( ; ; ) { @@ -812,7 +821,8 @@ int main(int argc, char **argv) if (buf_len < 0) break; - add_list(buf, buf_len, ext_buf); + if (!add_list(buf, buf_len, ext_buf)) + goto out_free; } printf("loaded %d\n", list_size); @@ -862,11 +872,26 @@ int main(int argc, char **argv) fprintf(fout, "\n"); } } - regfree(&order_pattern); - regfree(&pid_pattern); - regfree(&tgid_pattern); - regfree(&comm_pattern); - regfree(&ts_nsec_pattern); + +out_free: + if (ext_buf) + free(ext_buf); + if (buf) + free(buf); + if (list) + free(list); +out_free_ts: regfree(&free_ts_nsec_pattern); +out_ts: + regfree(&ts_nsec_pattern); +out_comm: + regfree(&comm_pattern); +out_tgid: + regfree(&tgid_pattern); +out_pid: + regfree(&pid_pattern); +out_order: + regfree(&order_pattern); + return 0; } -- cgit v1.2.3 From a9af8e6bb3e5de8ea9d29c1d318bcfbc5667c939 Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Fri, 23 Dec 2022 10:50:24 +0800 Subject: selftests/vm: ksm_functional_tests: fix a typo in comment Fix a typo of "comaring" which should be "comparing". Link: https://lkml.kernel.org/r/202212231050245952617@zte.com.cn Signed-off-by: Xu Panda Signed-off-by: xu xin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/ksm_functional_tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c index b11b7e5115dc..d8b5b4930412 100644 --- a/tools/testing/selftests/vm/ksm_functional_tests.c +++ b/tools/testing/selftests/vm/ksm_functional_tests.c @@ -37,7 +37,7 @@ static bool range_maps_duplicates(char *addr, unsigned long size) /* * There is no easy way to check if there are KSM pages mapped into * this range. We only check that the range does not map the same PFN - * twice by comaring each pair of mapped pages. + * twice by comparing each pair of mapped pages. */ for (offs_a = 0; offs_a < size; offs_a += pagesize) { pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a); -- cgit v1.2.3 From 541e06b772c1aaffb3b6a245ccface36d7107af2 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Thu, 5 Jan 2023 16:05:34 +0000 Subject: maple_tree: remove GFP_ZERO from kmem_cache_alloc() and kmem_cache_alloc_bulk() Preallocations are common in the VMA code to avoid allocating under certain locking conditions. The preallocations must also cover the worst-case scenario. Removing the GFP_ZERO flag from the kmem_cache_alloc() (and bulk variant) calls will reduce the amount of time spent zeroing memory that may not be used. Only zero out the necessary area to keep track of the allocations in the maple state. Zero the entire node prior to using it in the tree. This required internal changes to node counting on allocation, so the test code is also updated. 
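As a userspace analogy of the change (illustrative only, not the kernel
code): zero only the small bookkeeping fields at allocation time, and
clear the payload just before it is actually used:

    #include <stdlib.h>
    #include <string.h>

    /*
     * Analogy of the patch: instead of zeroing the whole node on every
     * allocation (what __GFP_ZERO did), zero only the two bookkeeping
     * fields and clear the payload when the node is really needed.
     */
    struct node {
        unsigned long total;      /* bookkeeping: zeroed at allocation */
        unsigned int node_count;  /* bookkeeping: zeroed at allocation */
        char payload[240];        /* left dirty until actually used */
    };

    static struct node *node_alloc(void)
    {
        struct node *n = malloc(sizeof(*n));

        if (n) {
            n->total = 0;
            n->node_count = 0;
        }
        return n;
    }

    int main(void)
    {
        struct node *n = node_alloc();

        if (n) {
            memset(n->payload, 0, sizeof(n->payload)); /* only at use time */
            free(n);
        }
        return 0;
    }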
This restores some micro-benchmark performance: up to +9% in mmtests mmap1 by my testing +10% to +20% in mmap, mmapaddr, mmapmany tests reported by Red Hat Link: https://bugzilla.redhat.com/show_bug.cgi?id=2149636 Link: https://lkml.kernel.org/r/20230105160427.2988454-1-Liam.Howlett@oracle.com Signed-off-by: Liam Howlett Reported-by: Jirka Hladky Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- lib/maple_tree.c | 80 +++++++++++++++++++++------------------- tools/testing/radix-tree/maple.c | 18 ++++----- 2 files changed, 52 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 94f0053ec3e0..8db3c336d19f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -149,13 +149,12 @@ struct maple_subtree_state { /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { - return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO); + return kmem_cache_alloc(maple_node_cache, gfp); } static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) { - return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size, - nodes); + return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) @@ -1125,9 +1124,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); + unsigned int req = mas_alloc_req(mas); /* nothing or a request pending. */ - if (unlikely(!total)) + if (WARN_ON(!total)) return NULL; if (total == 1) { @@ -1137,27 +1137,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) goto single_node; } - if (!node->node_count) { + if (node->node_count == 1) { /* Single allocation in this node. 
*/ mas->alloc = node->slot[0]; - node->slot[0] = NULL; mas->alloc->total = node->total - 1; ret = node; goto new_head; } - node->total--; - ret = node->slot[node->node_count]; - node->slot[node->node_count--] = NULL; + ret = node->slot[--node->node_count]; + node->slot[node->node_count] = NULL; single_node: new_head: - ret->total = 0; - ret->node_count = 0; - if (ret->request_count) { - mas_set_alloc_req(mas, ret->request_count + 1); - ret->request_count = 0; + if (req) { + req++; + mas_set_alloc_req(mas, req); } + + memset(ret, 0, sizeof(*ret)); return (struct maple_node *)ret; } @@ -1176,21 +1174,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) unsigned long count; unsigned int requested = mas_alloc_req(mas); - memset(reuse, 0, sizeof(*reuse)); count = mas_allocated(mas); - if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) { - if (head->slot[0]) - head->node_count++; - head->slot[head->node_count] = reuse; + reuse->request_count = 0; + reuse->node_count = 0; + if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { + head->slot[head->node_count++] = reuse; head->total++; goto done; } reuse->total = 1; if ((head) && !((unsigned long)head & 0x1)) { - head->request_count = 0; reuse->slot[0] = head; + reuse->node_count = 1; reuse->total += head->total; } @@ -1209,7 +1206,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); - unsigned long success = allocated; unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; @@ -1225,24 +1221,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) WARN_ON(!allocated); } - if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) { + if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { node = (struct maple_alloc *)mt_alloc_one(gfp); if (!node) goto nomem_one; - if (allocated) + if (allocated) { node->slot[0] = mas->alloc; + node->node_count = 1; + } else { + node->node_count = 0; + } - success++; mas->alloc = node; + node->total = ++allocated; requested--; } node = mas->alloc; + node->request_count = 0; while (requested) { max_req = MAPLE_ALLOC_SLOTS; - if (node->slot[0]) { - unsigned int offset = node->node_count + 1; + if (node->node_count) { + unsigned int offset = node->node_count; slots = (void **)&node->slot[offset]; max_req -= offset; @@ -1256,15 +1257,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) goto nomem_bulk; node->node_count += count; - /* zero indexed. 
*/ - if (slots == (void **)&node->slot) - node->node_count--; - - success += count; + allocated += count; node = node->slot[0]; + node->node_count = 0; + node->request_count = 0; requested -= count; } - mas->alloc->total = success; + mas->alloc->total = allocated; return; nomem_bulk: @@ -1273,7 +1272,7 @@ nomem_bulk: nomem_one: mas_set_alloc_req(mas, requested); if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) - mas->alloc->total = success; + mas->alloc->total = allocated; mas_set_err(mas, -ENOMEM); } @@ -5734,6 +5733,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; + unsigned long total; /* * When using mas_for_each() to insert an expected number of elements, @@ -5756,14 +5756,20 @@ void mas_destroy(struct ma_state *mas) } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); - while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) { + total = mas_allocated(mas); + while (total) { node = mas->alloc; mas->alloc = node->slot[0]; - if (node->node_count > 0) - mt_free_bulk(node->node_count, - (void __rcu **)&node->slot[1]); + if (node->node_count > 1) { + size_t count = node->node_count - 1; + + mt_free_bulk(count, (void __rcu **)&node->slot[1]); + total -= count; + } kmem_cache_free(maple_node_cache, node); + total--; } + mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 81fa7ec2e66a..1f36bc1c5d36 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -173,11 +173,11 @@ static noinline void check_new_node(struct maple_tree *mt) if (!MAPLE_32BIT) { if (i >= 35) - e = i - 35; + e = i - 34; else if (i >= 5) - e = i - 5; + e = i - 4; else if (i >= 2) - e = i - 2; + e = i - 1; } else { if (i >= 4) e = i - 4; @@ -305,17 +305,17 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); mn = mas_pop_node(&mas); /* get the next node. 
*/
 	MT_BUG_ON(mt, mn == NULL);
 	MT_BUG_ON(mt, not_empty(mn));
 	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS);
-	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2);
+	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
 	mas_push_node(&mas, mn);
 	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
-	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
+	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
 
 	/* Check the limit of pop/push/pop */
 	mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */
@@ -323,14 +323,14 @@
 	MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM));
 	MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL));
 	MT_BUG_ON(mt, mas_alloc_req(&mas));
-	MT_BUG_ON(mt, mas.alloc->node_count);
+	MT_BUG_ON(mt, mas.alloc->node_count != 1);
 	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2);
 	mn = mas_pop_node(&mas);
 	MT_BUG_ON(mt, not_empty(mn));
 	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
-	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
+	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
 	mas_push_node(&mas, mn);
-	MT_BUG_ON(mt, mas.alloc->node_count);
+	MT_BUG_ON(mt, mas.alloc->node_count != 1);
 	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2);
 	mn = mas_pop_node(&mas);
 	MT_BUG_ON(mt, not_empty(mn));
--
cgit v1.2.3


From dee2ad120571f38433211098cd6b95a59bdfc8c7 Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Wed, 4 Jan 2023 15:49:05 +0100
Subject: selftests/vm: cow: add COW tests for collapsing of PTE-mapped anon THP

Currently, anonymous PTE-mapped THPs cannot be collapsed in-place:
collapsing (e.g., via MADV_COLLAPSE) implies allocating a fresh THP and
mapping that new THP via a PMD: as it's a fresh anon THP, it will get the
exclusive flag set on the head page and everybody is happy.

However, if the kernel would ever support in-place collapse of anonymous
THPs (replacing a page table mapping each sub-page of a THP via PTEs with
a single PMD mapping the complete THP), exclusivity information stored
for each sub-page would have to be collapsed accordingly:

(1) All PTEs map !exclusive anon sub-pages: the in-place collapsed THP
    must not have the exclusive flag set on the head page mapped by the
    PMD.  This is the easiest case to handle ("simply don't set any
    exclusive flags").

(2) All PTEs map exclusive anon sub-pages: when collapsing, we have to
    clear the exclusive flag from all tail pages and only leave the
    exclusive flag set for the head page.  Otherwise, fork() after
    collapse would not clear the exclusive flags from the tail pages and
    we'd be in trouble once PTE-mapping the shared THP when writing to
    shared tail pages that still have the exclusive flag set.  This would
    effectively revert what the PTE-mapping code does when propagating
    the exclusive flag to all sub-pages.

(3) PTEs map a mixture of exclusive and !exclusive anon sub-pages (can
    happen e.g., due to MADV_DONTFORK before fork()).  We must not
    collapse the THP in-place, otherwise bad things may happen: the
    exclusive flags of sub-pages would get ignored and the exclusive flag
    of the head page would get used instead.

Now that we have MADV_COLLAPSE in place to trigger collapsing a THP,
let's add some test cases that would bail out early if we'd
voluntarily/accidentally unlock in-place collapse for anon THPs and
forget about taking proper care of exclusive flags.
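For reference, the userspace trigger for such a collapse is a single
madvise() call; a minimal sketch, assuming a 2 MiB THP size and using the
MADV_COLLAPSE fallback value the selftest itself defines:

    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MADV_COLLAPSE
    #define MADV_COLLAPSE 25 /* fallback value, as in the selftest */
    #endif

    int main(void)
    {
        const size_t thpsize = 2 * 1024 * 1024; /* assumed THP size */
        char *mem = mmap(NULL, thpsize, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (mem == MAP_FAILED)
            return 1;

        /*
         * A real test aligns the region to thpsize first (the selftest
         * over-allocates and trims); skipped here for brevity.
         */
        mem[0] = 1; /* populate so there is something to collapse */
        if (madvise(mem, thpsize, MADV_COLLAPSE))
            perror("MADV_COLLAPSE"); /* fails on kernels without support */

        munmap(mem, thpsize);
        return 0;
    }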
Running the test on a kernel with MADV_COLLAPSE support: # [INFO] Anonymous THP tests # [RUN] Basic COW after fork() when collapsing before fork() ok 169 No leak from parent into child # [RUN] Basic COW after fork() when collapsing after fork() (fully shared) ok 170 # SKIP MADV_COLLAPSE failed: Invalid argument # [RUN] Basic COW after fork() when collapsing after fork() (lower shared) ok 171 No leak from parent into child # [RUN] Basic COW after fork() when collapsing after fork() (upper shared) ok 172 No leak from parent into child For now, MADV_COLLAPSE always seems to fail if all PTEs map shared sub-pages. Link: https://lkml.kernel.org/r/20230104144905.460075-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Shuah Khan Cc: Hugh Dickins Cc: Peter Xu Cc: Vlastimil Babka Cc: Nadav Amit Cc: Zach O'Keefe Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/cow.c | 228 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/vm/cow.c index 26f6ea3079e2..16216d893d96 100644 --- a/tools/testing/selftests/vm/cow.c +++ b/tools/testing/selftests/vm/cow.c @@ -30,6 +30,10 @@ #include "../kselftest.h" #include "vm_util.h" +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif + static size_t pagesize; static int pagemap_fd; static size_t thpsize; @@ -1178,6 +1182,228 @@ static int tests_per_anon_test_case(void) return tests; } +enum anon_thp_collapse_test { + ANON_THP_COLLAPSE_UNSHARED, + ANON_THP_COLLAPSE_FULLY_SHARED, + ANON_THP_COLLAPSE_LOWER_SHARED, + ANON_THP_COLLAPSE_UPPER_SHARED, +}; + +static void do_test_anon_thp_collapse(char *mem, size_t size, + enum anon_thp_collapse_test test) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + /* + * Trigger PTE-mapping the THP by temporarily mapping a single subpage + * R/O, such that we can try collapsing it later. + */ + ret = mprotect(mem + pagesize, pagesize, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + /* Collapse before actually COW-sharing the page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* COW-share the full PTE-mapped THP. */ + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* Don't COW-share the upper part of the THP. */ + ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + /* Don't COW-share the lower part of the THP. 
*/ + ret = madvise(mem, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + case ANON_THP_COLLAPSE_FULLY_SHARED: + exit(child_memcmp_fn(mem, size, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + exit(child_memcmp_fn(mem + size / 2, size / 2, + &comm_pipes)); + break; + default: + assert(false); + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* + * Revert MADV_DONTFORK such that we merge the VMAs and are + * able to actually collapse. + */ + ret = madvise(mem, size, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + /* FALLTHROUGH */ + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* Collapse before anyone modified the COW-shared page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Modify the page. */ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + ksft_test_result(!ret, "No leak from parent into child\n"); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_anon_thp_collapse_unshared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); +} + +static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); +} + +static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); +} + +static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); +} + +/* + * Test cases that are specific to anonymous THP: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_thp_test_cases[] = { + /* + * Basic COW test for fork() without any GUP when collapsing a THP + * before fork(). + * + * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place + * collapse") might easily get COW handling wrong when not collapsing + * exclusivity information properly. + */ + { + "Basic COW after fork() when collapsing before fork()", + test_anon_thp_collapse_unshared, + }, + /* Basic COW test, but collapse after COW-sharing a full THP. */ + { + "Basic COW after fork() when collapsing after fork() (fully shared)", + test_anon_thp_collapse_fully_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the lower half of a + * THP. + */ + { + "Basic COW after fork() when collapsing after fork() (lower shared)", + test_anon_thp_collapse_lower_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the upper half of a + * THP. 
+ */ + { + "Basic COW after fork() when collapsing after fork() (upper shared)", + test_anon_thp_collapse_upper_shared, + }, +}; + +static void run_anon_thp_test_cases(void) +{ + int i; + + if (!thpsize) + return; + + ksft_print_msg("[INFO] Anonymous THP tests\n"); + + for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) { + struct test_case const *test_case = &anon_thp_test_cases[i]; + + ksft_print_msg("[RUN] %s\n", test_case->desc); + do_run_with_thp(test_case->fn, THP_RUN_PMD); + } +} + +static int tests_per_anon_thp_test_case(void) +{ + return thpsize ? 1 : 0; +} + typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); static void test_cow(char *mem, const char *smem, size_t size) @@ -1518,6 +1744,7 @@ int main(int argc, char **argv) ksft_print_header(); ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() + + ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() + ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case()); gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); @@ -1526,6 +1753,7 @@ int main(int argc, char **argv) ksft_exit_fail_msg("opening pagemap failed\n"); run_anon_test_cases(); + run_anon_thp_test_cases(); run_non_anon_test_cases(); err = ksft_get_fail_cnt(); -- cgit v1.2.3 From 799fb82aa132fa3a3886b7872997a5a84e820062 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:52 +0000 Subject: tools/vm: rename tools/vm to tools/mm Rename tools/vm to tools/mm for being more consistent with the code and documentation directories, and won't be confused with virtual machines. Link: https://lkml.kernel.org/r/20230103180754.129637-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../admin-guide/mm/idle_page_tracking.rst | 2 +- Documentation/admin-guide/mm/pagemap.rst | 4 +- Documentation/mm/page_owner.rst | 2 +- Documentation/mm/slub.rst | 2 +- Documentation/translations/zh_CN/mm/page_owner.rst | 2 +- MAINTAINERS | 2 +- mm/Kconfig.debug | 2 +- mm/memory-failure.c | 2 +- tools/mm/.gitignore | 4 + tools/mm/Makefile | 32 + tools/mm/page-types.c | 1396 ++++++++++++++++++ tools/mm/page_owner_sort.c | 897 ++++++++++++ tools/mm/slabinfo-gnuplot.sh | 268 ++++ tools/mm/slabinfo.c | 1544 ++++++++++++++++++++ tools/vm/.gitignore | 4 - tools/vm/Makefile | 32 - tools/vm/page-types.c | 1396 ------------------ tools/vm/page_owner_sort.c | 897 ------------ tools/vm/slabinfo-gnuplot.sh | 268 ---- tools/vm/slabinfo.c | 1544 -------------------- 20 files changed, 4150 insertions(+), 4150 deletions(-) create mode 100644 tools/mm/.gitignore create mode 100644 tools/mm/Makefile create mode 100644 tools/mm/page-types.c create mode 100644 tools/mm/page_owner_sort.c create mode 100644 tools/mm/slabinfo-gnuplot.sh create mode 100644 tools/mm/slabinfo.c delete mode 100644 tools/vm/.gitignore delete mode 100644 tools/vm/Makefile delete mode 100644 tools/vm/page-types.c delete mode 100644 tools/vm/page_owner_sort.c delete mode 100644 tools/vm/slabinfo-gnuplot.sh delete mode 100644 tools/vm/slabinfo.c (limited to 'tools') diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst index df9394fb39c2..19492064278c 100644 --- a/Documentation/admin-guide/mm/idle_page_tracking.rst +++ b/Documentation/admin-guide/mm/idle_page_tracking.rst @@ -65,7 +65,7 @@ workload one should: are not reclaimable, he or she can filter them out using ``/proc/kpageflags``. 
-The page-types tool in the tools/vm directory can be used to assist in this. +The page-types tool in the tools/mm directory can be used to assist in this. If the tool is run initially with the appropriate option, it will mark all the queried pages as idle. Subsequent runs of the tool can then show which pages have their idle flag cleared in the interim. diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index 6e2e416af783..ceb5da3172ba 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -46,7 +46,7 @@ There are four components to pagemap: * ``/proc/kpagecount``. This file contains a 64-bit count of the number of times each page is mapped, indexed by PFN. -The page-types tool in the tools/vm directory can be used to query the +The page-types tool in the tools/mm directory can be used to query the number of times a page is mapped. * ``/proc/kpageflags``. This file contains a 64-bit set of flags for each @@ -173,7 +173,7 @@ LRU related page flags 14 - SWAPBACKED The page is backed by swap/RAM. -The page-types tool in the tools/vm directory can be used to query the +The page-types tool in the tools/mm directory can be used to query the above flags. Using pagemap to do something useful diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index 127514955a5e..5df26c0a0c1f 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -61,7 +61,7 @@ Usage 1) Build user-space helper:: - cd tools/vm + cd tools/mm make page_owner_sort 2) Enable page owner: add "page_owner=on" to boot cmdline. diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst index 7f652216dabe..3ffa7eded251 100644 --- a/Documentation/mm/slub.rst +++ b/Documentation/mm/slub.rst @@ -21,7 +21,7 @@ slabs that have data in them. See "slabinfo -h" for more options when running the command. ``slabinfo`` can be compiled with :: - gcc -o slabinfo tools/vm/slabinfo.c + gcc -o slabinfo tools/mm/slabinfo.c Some of the modes of operation of ``slabinfo`` require that slub debugging be enabled on the command line. F.e. no tracking information will be diff --git a/Documentation/translations/zh_CN/mm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst index 21a6a0837d42..4d3b2c33e4ef 100644 --- a/Documentation/translations/zh_CN/mm/page_owner.rst +++ b/Documentation/translations/zh_CN/mm/page_owner.rst @@ -62,7 +62,7 @@ page owner在默认情况下是禁用的。所以,如果你想使用它,你 1) 构建用户空间的帮助:: - cd tools/vm + cd tools/mm make page_owner_sort 2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline. diff --git a/MAINTAINERS b/MAINTAINERS index c05f95aa7af1..c726adfd1f0d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13483,8 +13483,8 @@ F: include/linux/mm.h F: include/linux/mmzone.h F: include/linux/pagewalk.h F: mm/ +F: tools/mm/ F: tools/testing/selftests/vm/ -F: tools/vm/ VMALLOC M: Andrew Morton diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index fca699ad1fb0..d62f48131952 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -90,7 +90,7 @@ config PAGE_OWNER help to find bare alloc_page(s) leaks. Even if you include this feature on your build, it is disabled in default. You should pass "page_owner=on" to boot parameter in order to enable it. Eats - a fair amount of memory if enabled. See tools/vm/page_owner_sort.c + a fair amount of memory if enabled. See tools/mm/page_owner_sort.c for user-space helper. If unsure, say N. 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c77a9e37e27e..6bf07345ea2c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -24,7 +24,7 @@ * - You have a test that can be added to mce-test * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ * - The case actually shows up as a frequent (top 10) page state in - * tools/vm/page-types when running a real workload. + * tools/mm/page-types when running a real workload. * * There are several operations here with exponential complexity because * of unsuitable VM data structures. For example the operation to map back diff --git a/tools/mm/.gitignore b/tools/mm/.gitignore new file mode 100644 index 000000000000..922879f93fc8 --- /dev/null +++ b/tools/mm/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +slabinfo +page-types +page_owner_sort diff --git a/tools/mm/Makefile b/tools/mm/Makefile new file mode 100644 index 000000000000..9860622cbb15 --- /dev/null +++ b/tools/mm/Makefile @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for vm tools +# +include ../scripts/Makefile.include + +TARGETS=page-types slabinfo page_owner_sort + +LIB_DIR = ../lib/api +LIBS = $(LIB_DIR)/libapi.a + +CFLAGS = -Wall -Wextra -I../lib/ +LDFLAGS = $(LIBS) + +all: $(TARGETS) + +$(TARGETS): $(LIBS) + +$(LIBS): + make -C $(LIB_DIR) + +%: %.c + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +clean: + $(RM) page-types slabinfo page_owner_sort + make -C $(LIB_DIR) clean + +sbindir ?= /usr/sbin + +install: all + install -d $(DESTDIR)$(sbindir) + install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir) diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c new file mode 100644 index 000000000000..381dcc00cb62 --- /dev/null +++ b/tools/mm/page-types.c @@ -0,0 +1,1396 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * page-types: Tool for querying page flags + * + * Copyright (C) 2009 Intel corporation + * + * Authors: Wu Fengguang + */ + +#define _FILE_OFFSET_BITS 64 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../include/uapi/linux/magic.h" +#include "../../include/uapi/linux/kernel-page-flags.h" +#include + +#ifndef MAX_PATH +# define MAX_PATH 256 +#endif + +#ifndef STR +# define _STR(x) #x +# define STR(x) _STR(x) +#endif + +/* + * pagemap kernel ABI bits + */ + +#define PM_ENTRY_BYTES 8 +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1) +#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +#define MAX_SWAPFILES_SHIFT 5 +#define PM_SWAP_OFFSET(x) (((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT) +#define PM_SOFT_DIRTY (1ULL << 55) +#define PM_MMAP_EXCLUSIVE (1ULL << 56) +#define PM_FILE (1ULL << 61) +#define PM_SWAP (1ULL << 62) +#define PM_PRESENT (1ULL << 63) + +/* + * kernel page flags + */ + +#define KPF_BYTES 8 +#define PROC_KPAGEFLAGS "/proc/kpageflags" +#define PROC_KPAGECOUNT "/proc/kpagecount" +#define PROC_KPAGECGROUP "/proc/kpagecgroup" + +#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap" + +/* [32-] kernel hacking assistances */ +#define KPF_RESERVED 32 +#define KPF_MLOCKED 33 +#define KPF_MAPPEDTODISK 34 +#define KPF_PRIVATE 35 +#define KPF_PRIVATE_2 36 +#define KPF_OWNER_PRIVATE 37 +#define KPF_ARCH 38 +#define KPF_UNCACHED 39 +#define KPF_SOFTDIRTY 40 +#define KPF_ARCH_2 41 + +/* [47-] take some arbitrary free slots for expanding overloaded flags + * not part of kernel API + */ +#define 
KPF_ANON_EXCLUSIVE 47 +#define KPF_READAHEAD 48 +#define KPF_SLOB_FREE 49 +#define KPF_SLUB_FROZEN 50 +#define KPF_SLUB_DEBUG 51 +#define KPF_FILE 61 +#define KPF_SWAP 62 +#define KPF_MMAP_EXCLUSIVE 63 + +#define KPF_ALL_BITS ((uint64_t)~0ULL) +#define KPF_HACKERS_BITS (0xffffULL << 32) +#define KPF_OVERLOADED_BITS (0xffffULL << 48) +#define BIT(name) (1ULL << KPF_##name) +#define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) + +static const char * const page_flag_names[] = { + [KPF_LOCKED] = "L:locked", + [KPF_ERROR] = "E:error", + [KPF_REFERENCED] = "R:referenced", + [KPF_UPTODATE] = "U:uptodate", + [KPF_DIRTY] = "D:dirty", + [KPF_LRU] = "l:lru", + [KPF_ACTIVE] = "A:active", + [KPF_SLAB] = "S:slab", + [KPF_WRITEBACK] = "W:writeback", + [KPF_RECLAIM] = "I:reclaim", + [KPF_BUDDY] = "B:buddy", + + [KPF_MMAP] = "M:mmap", + [KPF_ANON] = "a:anonymous", + [KPF_SWAPCACHE] = "s:swapcache", + [KPF_SWAPBACKED] = "b:swapbacked", + [KPF_COMPOUND_HEAD] = "H:compound_head", + [KPF_COMPOUND_TAIL] = "T:compound_tail", + [KPF_HUGE] = "G:huge", + [KPF_UNEVICTABLE] = "u:unevictable", + [KPF_HWPOISON] = "X:hwpoison", + [KPF_NOPAGE] = "n:nopage", + [KPF_KSM] = "x:ksm", + [KPF_THP] = "t:thp", + [KPF_OFFLINE] = "o:offline", + [KPF_PGTABLE] = "g:pgtable", + [KPF_ZERO_PAGE] = "z:zero_page", + [KPF_IDLE] = "i:idle_page", + + [KPF_RESERVED] = "r:reserved", + [KPF_MLOCKED] = "m:mlocked", + [KPF_MAPPEDTODISK] = "d:mappedtodisk", + [KPF_PRIVATE] = "P:private", + [KPF_PRIVATE_2] = "p:private_2", + [KPF_OWNER_PRIVATE] = "O:owner_private", + [KPF_ARCH] = "h:arch", + [KPF_UNCACHED] = "c:uncached", + [KPF_SOFTDIRTY] = "f:softdirty", + [KPF_ARCH_2] = "H:arch_2", + + [KPF_ANON_EXCLUSIVE] = "d:anon_exclusive", + [KPF_READAHEAD] = "I:readahead", + [KPF_SLOB_FREE] = "P:slob_free", + [KPF_SLUB_FROZEN] = "A:slub_frozen", + [KPF_SLUB_DEBUG] = "E:slub_debug", + + [KPF_FILE] = "F:file", + [KPF_SWAP] = "w:swap", + [KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive", +}; + + +/* + * data structures + */ + +static int opt_raw; /* for kernel developers */ +static int opt_list; /* list pages (in ranges) */ +static int opt_mark_idle; /* set accessed bit */ +static int opt_no_summary; /* don't show summary */ +static pid_t opt_pid; /* process to walk */ +const char *opt_file; /* file or directory path */ +static uint64_t opt_cgroup; /* cgroup inode */ +static int opt_list_cgroup;/* list page cgroup */ +static int opt_list_mapcnt;/* list page map count */ +static const char *opt_kpageflags;/* kpageflags file to parse */ + +#define MAX_ADDR_RANGES 1024 +static int nr_addr_ranges; +static unsigned long opt_offset[MAX_ADDR_RANGES]; +static unsigned long opt_size[MAX_ADDR_RANGES]; + +#define MAX_VMAS 10240 +static int nr_vmas; +static unsigned long pg_start[MAX_VMAS]; +static unsigned long pg_end[MAX_VMAS]; + +#define MAX_BIT_FILTERS 64 +static int nr_bit_filters; +static uint64_t opt_mask[MAX_BIT_FILTERS]; +static uint64_t opt_bits[MAX_BIT_FILTERS]; + +static int page_size; + +static int pagemap_fd; +static int kpageflags_fd; +static int kpagecount_fd = -1; +static int kpagecgroup_fd = -1; +static int page_idle_fd = -1; + +static int opt_hwpoison; +static int opt_unpoison; + +static const char *hwpoison_debug_fs; +static int hwpoison_inject_fd; +static int hwpoison_forget_fd; + +#define HASH_SHIFT 13 +#define HASH_SIZE (1 << HASH_SHIFT) +#define HASH_MASK (HASH_SIZE - 1) +#define HASH_KEY(flags) (flags & HASH_MASK) + +static unsigned long total_pages; +static unsigned long nr_pages[HASH_SIZE]; +static uint64_t page_flags[HASH_SIZE]; + + +/* + 
* helper functions + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define min_t(type, x, y) ({ \ + type __min1 = (x); \ + type __min2 = (y); \ + __min1 < __min2 ? __min1 : __min2; }) + +#define max_t(type, x, y) ({ \ + type __max1 = (x); \ + type __max2 = (y); \ + __max1 > __max2 ? __max1 : __max2; }) + +static unsigned long pages2mb(unsigned long pages) +{ + return (pages * page_size) >> 20; +} + +static void fatal(const char *x, ...) +{ + va_list ap; + + va_start(ap, x); + vfprintf(stderr, x, ap); + va_end(ap); + exit(EXIT_FAILURE); +} + +static int checked_open(const char *pathname, int flags) +{ + int fd = open(pathname, flags); + + if (fd < 0) { + perror(pathname); + exit(EXIT_FAILURE); + } + + return fd; +} + +/* + * pagemap/kpageflags routines + */ + +static unsigned long do_u64_read(int fd, const char *name, + uint64_t *buf, + unsigned long index, + unsigned long count) +{ + long bytes; + + if (index > ULONG_MAX / 8) + fatal("index overflow: %lu\n", index); + + bytes = pread(fd, buf, count * 8, (off_t)index * 8); + if (bytes < 0) { + perror(name); + exit(EXIT_FAILURE); + } + if (bytes % 8) + fatal("partial read: %lu bytes\n", bytes); + + return bytes / 8; +} + +static unsigned long kpageflags_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages); +} + +static unsigned long kpagecgroup_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + if (kpagecgroup_fd < 0) + return pages; + + return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages); +} + +static unsigned long kpagecount_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return kpagecount_fd < 0 ? pages : + do_u64_read(kpagecount_fd, PROC_KPAGECOUNT, + buf, index, pages); +} + +static unsigned long pagemap_read(uint64_t *buf, + unsigned long index, + unsigned long pages) +{ + return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages); +} + +static unsigned long pagemap_pfn(uint64_t val) +{ + unsigned long pfn; + + if (val & PM_PRESENT) + pfn = PM_PFRAME(val); + else + pfn = 0; + + return pfn; +} + +static unsigned long pagemap_swap_offset(uint64_t val) +{ + return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0; +} + +/* + * page flag names + */ + +static char *page_flag_name(uint64_t flags) +{ + static char buf[65]; + int present; + size_t i, j; + + for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { + present = (flags >> i) & 1; + if (!page_flag_names[i]) { + if (present) + fatal("unknown flag bit %d\n", i); + continue; + } + buf[j++] = present ? 
page_flag_names[i][0] : '_'; + } + + return buf; +} + +static char *page_flag_longname(uint64_t flags) +{ + static char buf[1024]; + size_t i, n; + + for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + if ((flags >> i) & 1) + n += snprintf(buf + n, sizeof(buf) - n, "%s,", + page_flag_names[i] + 2); + } + if (n) + n--; + buf[n] = '\0'; + + return buf; +} + + +/* + * page list and summary + */ + +static void show_page_range(unsigned long voffset, unsigned long offset, + unsigned long size, uint64_t flags, + uint64_t cgroup, uint64_t mapcnt) +{ + static uint64_t flags0; + static uint64_t cgroup0; + static uint64_t mapcnt0; + static unsigned long voff; + static unsigned long index; + static unsigned long count; + + if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 && + offset == index + count && size && voffset == voff + count) { + count += size; + return; + } + + if (count) { + if (opt_pid) + printf("%lx\t", voff); + if (opt_file) + printf("%lx\t", voff); + if (opt_list_cgroup) + printf("@%llu\t", (unsigned long long)cgroup0); + if (opt_list_mapcnt) + printf("%lu\t", mapcnt0); + printf("%lx\t%lx\t%s\n", + index, count, page_flag_name(flags0)); + } + + flags0 = flags; + cgroup0 = cgroup; + mapcnt0 = mapcnt; + index = offset; + voff = voffset; + count = size; +} + +static void flush_page_range(void) +{ + show_page_range(0, 0, 0, 0, 0, 0); +} + +static void show_page(unsigned long voffset, unsigned long offset, + uint64_t flags, uint64_t cgroup, uint64_t mapcnt) +{ + if (opt_pid) + printf("%lx\t", voffset); + if (opt_file) + printf("%lx\t", voffset); + if (opt_list_cgroup) + printf("@%llu\t", (unsigned long long)cgroup); + if (opt_list_mapcnt) + printf("%lu\t", mapcnt); + + printf("%lx\t%s\n", offset, page_flag_name(flags)); +} + +static void show_summary(void) +{ + size_t i; + + printf(" flags\tpage-count MB" + " symbolic-flags\t\t\tlong-symbolic-flags\n"); + + for (i = 0; i < ARRAY_SIZE(nr_pages); i++) { + if (nr_pages[i]) + printf("0x%016llx\t%10lu %8lu %s\t%s\n", + (unsigned long long)page_flags[i], + nr_pages[i], + pages2mb(nr_pages[i]), + page_flag_name(page_flags[i]), + page_flag_longname(page_flags[i])); + } + + printf(" total\t%10lu %8lu\n", + total_pages, pages2mb(total_pages)); +} + + +/* + * page flag filters + */ + +static int bit_mask_ok(uint64_t flags) +{ + int i; + + for (i = 0; i < nr_bit_filters; i++) { + if (opt_bits[i] == KPF_ALL_BITS) { + if ((flags & opt_mask[i]) == 0) + return 0; + } else { + if ((flags & opt_mask[i]) != opt_bits[i]) + return 0; + } + } + + return 1; +} + +static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme) +{ + /* Anonymous pages overload PG_mappedtodisk */ + if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK))) + flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE); + + /* SLOB/SLUB overload several page flags */ + if (flags & BIT(SLAB)) { + if (flags & BIT(PRIVATE)) + flags ^= BIT(PRIVATE) | BIT(SLOB_FREE); + if (flags & BIT(ACTIVE)) + flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN); + if (flags & BIT(ERROR)) + flags ^= BIT(ERROR) | BIT(SLUB_DEBUG); + } + + /* PG_reclaim is overloaded as PG_readahead in the read path */ + if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM)) + flags ^= BIT(RECLAIM) | BIT(READAHEAD); + + if (pme & PM_SOFT_DIRTY) + flags |= BIT(SOFTDIRTY); + if (pme & PM_FILE) + flags |= BIT(FILE); + if (pme & PM_SWAP) + flags |= BIT(SWAP); + if (pme & PM_MMAP_EXCLUSIVE) + flags |= BIT(MMAP_EXCLUSIVE); + + return flags; +} + +static uint64_t 
well_known_flags(uint64_t flags) +{ + /* hide flags intended only for kernel hacker */ + flags &= ~KPF_HACKERS_BITS; + + /* hide non-hugeTLB compound pages */ + if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE))) + flags &= ~BITS_COMPOUND; + + return flags; +} + +static uint64_t kpageflags_flags(uint64_t flags, uint64_t pme) +{ + if (opt_raw) + flags = expand_overloaded_flags(flags, pme); + else + flags = well_known_flags(flags); + + return flags; +} + +/* + * page actions + */ + +static void prepare_hwpoison_fd(void) +{ + char buf[MAX_PATH + 1]; + + hwpoison_debug_fs = debugfs__mount(); + if (!hwpoison_debug_fs) { + perror("mount debugfs"); + exit(EXIT_FAILURE); + } + + if (opt_hwpoison && !hwpoison_inject_fd) { + snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn", + hwpoison_debug_fs); + hwpoison_inject_fd = checked_open(buf, O_WRONLY); + } + + if (opt_unpoison && !hwpoison_forget_fd) { + snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn", + hwpoison_debug_fs); + hwpoison_forget_fd = checked_open(buf, O_WRONLY); + } +} + +static int hwpoison_page(unsigned long offset) +{ + char buf[100]; + int len; + + len = sprintf(buf, "0x%lx\n", offset); + len = write(hwpoison_inject_fd, buf, len); + if (len < 0) { + perror("hwpoison inject"); + return len; + } + return 0; +} + +static int unpoison_page(unsigned long offset) +{ + char buf[100]; + int len; + + len = sprintf(buf, "0x%lx\n", offset); + len = write(hwpoison_forget_fd, buf, len); + if (len < 0) { + perror("hwpoison forget"); + return len; + } + return 0; +} + +static int mark_page_idle(unsigned long offset) +{ + static unsigned long off; + static uint64_t buf; + int len; + + if ((offset / 64 == off / 64) || buf == 0) { + buf |= 1UL << (offset % 64); + off = offset; + return 0; + } + + len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64)); + if (len < 0) { + perror("mark page idle"); + return len; + } + + buf = 1UL << (offset % 64); + off = offset; + + return 0; +} + +/* + * page frame walker + */ + +static size_t hash_slot(uint64_t flags) +{ + size_t k = HASH_KEY(flags); + size_t i; + + /* Explicitly reserve slot 0 for flags 0: the following logic + * cannot distinguish an unoccupied slot from slot (flags==0). 
+ */ + if (flags == 0) + return 0; + + /* search through the remaining (HASH_SIZE-1) slots */ + for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) { + if (!k || k >= ARRAY_SIZE(page_flags)) + k = 1; + if (page_flags[k] == 0) { + page_flags[k] = flags; + return k; + } + if (page_flags[k] == flags) + return k; + } + + fatal("hash table full: bump up HASH_SHIFT?\n"); + exit(EXIT_FAILURE); +} + +static void add_page(unsigned long voffset, unsigned long offset, + uint64_t flags, uint64_t cgroup, uint64_t mapcnt, + uint64_t pme) +{ + flags = kpageflags_flags(flags, pme); + + if (!bit_mask_ok(flags)) + return; + + if (opt_cgroup && cgroup != (uint64_t)opt_cgroup) + return; + + if (opt_hwpoison) + hwpoison_page(offset); + if (opt_unpoison) + unpoison_page(offset); + + if (opt_mark_idle) + mark_page_idle(offset); + + if (opt_list == 1) + show_page_range(voffset, offset, 1, flags, cgroup, mapcnt); + else if (opt_list == 2) + show_page(voffset, offset, flags, cgroup, mapcnt); + + nr_pages[hash_slot(flags)]++; + total_pages++; +} + +#define KPAGEFLAGS_BATCH (64 << 10) /* 64k pages */ +static void walk_pfn(unsigned long voffset, + unsigned long index, + unsigned long count, + uint64_t pme) +{ + uint64_t buf[KPAGEFLAGS_BATCH]; + uint64_t cgi[KPAGEFLAGS_BATCH]; + uint64_t cnt[KPAGEFLAGS_BATCH]; + unsigned long batch; + unsigned long pages; + unsigned long i; + + /* + * kpagecgroup_read() reads only if kpagecgroup were opened, but + * /proc/kpagecgroup might even not exist, so it's better to fill + * them with zeros here. + */ + if (count == 1) + cgi[0] = 0; + else + memset(cgi, 0, sizeof cgi); + + while (count) { + batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH); + pages = kpageflags_read(buf, index, batch); + if (pages == 0) + break; + + if (kpagecgroup_read(cgi, index, pages) != pages) + fatal("kpagecgroup returned fewer pages than expected"); + + if (kpagecount_read(cnt, index, pages) != pages) + fatal("kpagecount returned fewer pages than expected"); + + for (i = 0; i < pages; i++) + add_page(voffset + i, index + i, + buf[i], cgi[i], cnt[i], pme); + + index += pages; + count -= pages; + } +} + +static void walk_swap(unsigned long voffset, uint64_t pme) +{ + uint64_t flags = kpageflags_flags(0, pme); + + if (!bit_mask_ok(flags)) + return; + + if (opt_cgroup) + return; + + if (opt_list == 1) + show_page_range(voffset, pagemap_swap_offset(pme), + 1, flags, 0, 0); + else if (opt_list == 2) + show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0); + + nr_pages[hash_slot(flags)]++; + total_pages++; +} + +#define PAGEMAP_BATCH (64 << 10) +static void walk_vma(unsigned long index, unsigned long count) +{ + uint64_t buf[PAGEMAP_BATCH]; + unsigned long batch; + unsigned long pages; + unsigned long pfn; + unsigned long i; + + while (count) { + batch = min_t(unsigned long, count, PAGEMAP_BATCH); + pages = pagemap_read(buf, index, batch); + if (pages == 0) + break; + + for (i = 0; i < pages; i++) { + pfn = pagemap_pfn(buf[i]); + if (pfn) + walk_pfn(index + i, pfn, 1, buf[i]); + if (buf[i] & PM_SWAP) + walk_swap(index + i, buf[i]); + } + + index += pages; + count -= pages; + } +} + +static void walk_task(unsigned long index, unsigned long count) +{ + const unsigned long end = index + count; + unsigned long start; + int i = 0; + + while (index < end) { + + while (pg_end[i] <= index) + if (++i >= nr_vmas) + return; + if (pg_start[i] >= end) + return; + + start = max_t(unsigned long, pg_start[i], index); + index = min_t(unsigned long, pg_end[i], end); + + assert(start < index); + walk_vma(start, index - 
start); + } +} + +static void add_addr_range(unsigned long offset, unsigned long size) +{ + if (nr_addr_ranges >= MAX_ADDR_RANGES) + fatal("too many addr ranges\n"); + + opt_offset[nr_addr_ranges] = offset; + opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset); + nr_addr_ranges++; +} + +static void walk_addr_ranges(void) +{ + int i; + + kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); + + if (!nr_addr_ranges) + add_addr_range(0, ULONG_MAX); + + for (i = 0; i < nr_addr_ranges; i++) + if (!opt_pid) + walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0); + else + walk_task(opt_offset[i], opt_size[i]); + + if (opt_mark_idle) + mark_page_idle(0); + + close(kpageflags_fd); +} + + +/* + * user interface + */ + +static const char *page_flag_type(uint64_t flag) +{ + if (flag & KPF_HACKERS_BITS) + return "(r)"; + if (flag & KPF_OVERLOADED_BITS) + return "(o)"; + return " "; +} + +static void usage(void) +{ + size_t i, j; + + printf( +"page-types [options]\n" +" -r|--raw Raw mode, for kernel developers\n" +" -d|--describe flags Describe flags\n" +" -a|--addr addr-spec Walk a range of pages\n" +" -b|--bits bits-spec Walk pages with specified bits\n" +" -c|--cgroup path|@inode Walk pages within memory cgroup\n" +" -p|--pid pid Walk process address space\n" +" -f|--file filename Walk file address space\n" +" -i|--mark-idle Mark pages idle\n" +" -l|--list Show page details in ranges\n" +" -L|--list-each Show page details one by one\n" +" -C|--list-cgroup Show cgroup inode for pages\n" +" -M|--list-mapcnt Show page map count\n" +" -N|--no-summary Don't show summary info\n" +" -X|--hwpoison hwpoison pages\n" +" -x|--unpoison unpoison pages\n" +" -F|--kpageflags filename kpageflags file to parse\n" +" -h|--help Show this usage message\n" +"flags:\n" +" 0x10 bitfield format, e.g.\n" +" anon bit-name, e.g.\n" +" 0x10,anon comma-separated list, e.g.\n" +"addr-spec:\n" +" N one page at offset N (unit: pages)\n" +" N+M pages range from N to N+M-1\n" +" N,M pages range from N to M-1\n" +" N, pages range from N to end\n" +" ,M pages range from 0 to M-1\n" +"bits-spec:\n" +" bit1,bit2 (flags & (bit1|bit2)) != 0\n" +" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" +" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n" +" =bit1,bit2 flags == (bit1|bit2)\n" +"bit-names:\n" + ); + + for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + printf("%16s%s", page_flag_names[i] + 2, + page_flag_type(1ULL << i)); + if (++j > 3) { + j = 0; + putchar('\n'); + } + } + printf("\n " + "(r) raw mode bits (o) overloaded bits\n"); +} + +static unsigned long long parse_number(const char *str) +{ + unsigned long long n; + + n = strtoll(str, NULL, 0); + + if (n == 0 && str[0] != '0') + fatal("invalid name or number: %s\n", str); + + return n; +} + +static void parse_pid(const char *str) +{ + FILE *file; + char buf[5000]; + + opt_pid = parse_number(str); + + sprintf(buf, "/proc/%d/pagemap", opt_pid); + pagemap_fd = checked_open(buf, O_RDONLY); + + sprintf(buf, "/proc/%d/maps", opt_pid); + file = fopen(buf, "r"); + if (!file) { + perror(buf); + exit(EXIT_FAILURE); + } + + while (fgets(buf, sizeof(buf), file) != NULL) { + unsigned long vm_start; + unsigned long vm_end; + unsigned long long pgoff; + int major, minor; + char r, w, x, s; + unsigned long ino; + int n; + + n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu", + &vm_start, + &vm_end, + &r, &w, &x, &s, + &pgoff, + &major, &minor, + &ino); + if (n < 10) { + fprintf(stderr, "unexpected line: %s\n", buf); + continue; + 
}
+		pg_start[nr_vmas] = vm_start / page_size;
+		pg_end[nr_vmas] = vm_end / page_size;
+		if (++nr_vmas >= MAX_VMAS) {
+			fprintf(stderr, "too many VMAs\n");
+			break;
+		}
+	}
+	fclose(file);
+}
+
+static void show_file(const char *name, const struct stat *st)
+{
+	unsigned long long size = st->st_size;
+	char atime[64], mtime[64];
+	long now = time(NULL);
+
+	printf("%s\tInode: %u\tSize: %llu (%llu pages)\n",
+			name, (unsigned)st->st_ino,
+			size, (size + page_size - 1) / page_size);
+
+	strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime));
+	strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime));
+
+	printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n",
+			mtime, now - st->st_mtime,
+			atime, now - st->st_atime);
+}
+
+static sigjmp_buf sigbus_jmp;
+
+static void * volatile sigbus_addr;
+
+static void sigbus_handler(int sig, siginfo_t *info, void *ucontex)
+{
+	(void)sig;
+	(void)ucontex;
+	sigbus_addr = info ? info->si_addr : NULL;
+	siglongjmp(sigbus_jmp, 1);
+}
+
+static struct sigaction sigbus_action = {
+	.sa_sigaction = sigbus_handler,
+	.sa_flags = SA_SIGINFO,
+};
+
+static void walk_file_range(const char *name, int fd,
+			    unsigned long off, unsigned long end)
+{
+	uint8_t vec[PAGEMAP_BATCH];
+	uint64_t buf[PAGEMAP_BATCH], flags;
+	uint64_t cgroup = 0;
+	uint64_t mapcnt = 0;
+	unsigned long nr_pages, pfn, i;
+	ssize_t len;
+	void *ptr;
+	int first = 1;
+
+	for (; off < end; off += len) {
+		nr_pages = (end - off + page_size - 1) / page_size;
+		if (nr_pages > PAGEMAP_BATCH)
+			nr_pages = PAGEMAP_BATCH;
+		len = nr_pages * page_size;
+
+		ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
+		if (ptr == MAP_FAILED)
+			fatal("mmap failed: %s", name);
+
+		/* determine cached pages */
+		if (mincore(ptr, len, vec))
+			fatal("mincore failed: %s", name);
+
+		/* turn off readahead */
+		if (madvise(ptr, len, MADV_RANDOM))
+			fatal("madvise failed: %s", name);
+
+		if (sigsetjmp(sigbus_jmp, 1)) {
+			end = off + (sigbus_addr ? sigbus_addr - ptr : 0);
+			fprintf(stderr, "got sigbus at offset %lld: %s\n",
+					(long long)end, name);
+			goto got_sigbus;
+		}
+
+		/* populate ptes */
+		for (i = 0; i < nr_pages; i++) {
+			if (vec[i] & 1)
+				(void)*(volatile int *)(ptr + i * page_size);
+		}
+got_sigbus:
+
+		/* turn off harvesting reference bits */
+		if (madvise(ptr, len, MADV_SEQUENTIAL))
+			fatal("madvise failed: %s", name);
+
+		if (pagemap_read(buf, (unsigned long)ptr / page_size,
+					nr_pages) != nr_pages)
+			fatal("cannot read pagemap");
+
+		munmap(ptr, len);
+
+		for (i = 0; i < nr_pages; i++) {
+			pfn = pagemap_pfn(buf[i]);
+			if (!pfn)
+				continue;
+			if (!kpageflags_read(&flags, pfn, 1))
+				continue;
+			if (!kpagecgroup_read(&cgroup, pfn, 1))
+				fatal("kpagecgroup_read failed");
+			if (!kpagecount_read(&mapcnt, pfn, 1))
+				fatal("kpagecount_read failed");
+			if (first && opt_list) {
+				first = 0;
+				flush_page_range();
+			}
+			add_page(off / page_size + i, pfn,
+				 flags, cgroup, mapcnt, buf[i]);
+		}
+	}
+}
+
+static void walk_file(const char *name, const struct stat *st)
+{
+	int i;
+	int fd;
+
+	fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
+
+	if (!nr_addr_ranges)
+		add_addr_range(0, st->st_size / page_size);
+
+	for (i = 0; i < nr_addr_ranges; i++)
+		walk_file_range(name, fd, opt_offset[i] * page_size,
+				(opt_offset[i] + opt_size[i]) * page_size);
+
+	close(fd);
+}
+
+int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f)
+{
+	(void)f;
+	switch (type) {
+	case FTW_F:
+		if (S_ISREG(st->st_mode))
+			walk_file(name, st);
+		break;
+	case FTW_DNR:
+		fprintf(stderr, "cannot read dir: %s\n", name);
+		break;
+	}
+	return 0;
+}
+
+struct stat st;
+
+static void walk_page_cache(void)
+{
+	kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
+	pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
+	sigaction(SIGBUS, &sigbus_action, NULL);
+
+	if (stat(opt_file, &st))
+		fatal("stat failed: %s\n", opt_file);
+
+	if (S_ISREG(st.st_mode)) {
+		walk_file(opt_file, &st);
+	} else if (S_ISDIR(st.st_mode)) {
+		/* do not follow symlinks and mountpoints */
+		if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0)
+			fatal("nftw failed: %s\n", opt_file);
+	} else
+		fatal("unhandled file type: %s\n", opt_file);
+
+	close(kpageflags_fd);
+	close(pagemap_fd);
+	signal(SIGBUS, SIG_DFL);
+}
+
+static void parse_file(const char *name)
+{
+	opt_file = name;
+}
+
+static void parse_cgroup(const char *path)
+{
+	if (path[0] == '@') {
+		opt_cgroup = parse_number(path + 1);
+		return;
+	}
+
+	struct stat st;
+
+	if (stat(path, &st))
+		fatal("stat failed: %s: %m\n", path);
+
+	if (!S_ISDIR(st.st_mode))
+		fatal("cgroup supposed to be a directory: %s\n", path);
+
+	opt_cgroup = st.st_ino;
+}
+
+static void parse_addr_range(const char *optarg)
+{
+	unsigned long offset;
+	unsigned long size;
+	char *p;
+
+	p = strchr(optarg, ',');
+	if (!p)
+		p = strchr(optarg, '+');
+
+	if (p == optarg) {
+		offset = 0;
+		size = parse_number(p + 1);
+	} else if (p) {
+		offset = parse_number(optarg);
+		if (p[1] == '\0')
+			size = ULONG_MAX;
+		else {
+			size = parse_number(p + 1);
+			if (*p == ',') {
+				if (size < offset)
+					fatal("invalid range: %lu,%lu\n",
+							offset, size);
+				size -= offset;
+			}
+		}
+	} else {
+		offset = parse_number(optarg);
+		size = 1;
+	}
+
+	add_addr_range(offset, size);
+}
+
+static void add_bits_filter(uint64_t mask, uint64_t bits)
+{
+	if (nr_bit_filters >= MAX_BIT_FILTERS)
+		fatal("too many bit filters\n");
+
+	opt_mask[nr_bit_filters] = mask;
+	opt_bits[nr_bit_filters] = bits;
+	nr_bit_filters++;
+}
+
+static uint64_t
parse_flag_name(const char *str, int len) +{ + size_t i; + + if (!*str || !len) + return 0; + + if (len <= 8 && !strncmp(str, "compound", len)) + return BITS_COMPOUND; + + for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) { + if (!page_flag_names[i]) + continue; + if (!strncmp(str, page_flag_names[i] + 2, len)) + return 1ULL << i; + } + + return parse_number(str); +} + +static uint64_t parse_flag_names(const char *str, int all) +{ + const char *p = str; + uint64_t flags = 0; + + while (1) { + if (*p == ',' || *p == '=' || *p == '\0') { + if ((*str != '~') || (*str == '~' && all && *++str)) + flags |= parse_flag_name(str, p - str); + if (*p != ',') + break; + str = p + 1; + } + p++; + } + + return flags; +} + +static void parse_bits_mask(const char *optarg) +{ + uint64_t mask; + uint64_t bits; + const char *p; + + p = strchr(optarg, '='); + if (p == optarg) { + mask = KPF_ALL_BITS; + bits = parse_flag_names(p + 1, 0); + } else if (p) { + mask = parse_flag_names(optarg, 0); + bits = parse_flag_names(p + 1, 0); + } else if (strchr(optarg, '~')) { + mask = parse_flag_names(optarg, 1); + bits = parse_flag_names(optarg, 0); + } else { + mask = parse_flag_names(optarg, 0); + bits = KPF_ALL_BITS; + } + + add_bits_filter(mask, bits); +} + +static void parse_kpageflags(const char *name) +{ + opt_kpageflags = name; +} + +static void describe_flags(const char *optarg) +{ + uint64_t flags = parse_flag_names(optarg, 0); + + printf("0x%016llx\t%s\t%s\n", + (unsigned long long)flags, + page_flag_name(flags), + page_flag_longname(flags)); +} + +static const struct option opts[] = { + { "raw" , 0, NULL, 'r' }, + { "pid" , 1, NULL, 'p' }, + { "file" , 1, NULL, 'f' }, + { "addr" , 1, NULL, 'a' }, + { "bits" , 1, NULL, 'b' }, + { "cgroup" , 1, NULL, 'c' }, + { "describe" , 1, NULL, 'd' }, + { "mark-idle" , 0, NULL, 'i' }, + { "list" , 0, NULL, 'l' }, + { "list-each" , 0, NULL, 'L' }, + { "list-cgroup", 0, NULL, 'C' }, + { "list-mapcnt", 0, NULL, 'M' }, + { "no-summary", 0, NULL, 'N' }, + { "hwpoison" , 0, NULL, 'X' }, + { "unpoison" , 0, NULL, 'x' }, + { "kpageflags", 0, NULL, 'F' }, + { "help" , 0, NULL, 'h' }, + { NULL , 0, NULL, 0 } +}; + +int main(int argc, char *argv[]) +{ + int c; + + page_size = getpagesize(); + + while ((c = getopt_long(argc, argv, + "rp:f:a:b:d:c:CilLMNXxF:h", + opts, NULL)) != -1) { + switch (c) { + case 'r': + opt_raw = 1; + break; + case 'p': + parse_pid(optarg); + break; + case 'f': + parse_file(optarg); + break; + case 'a': + parse_addr_range(optarg); + break; + case 'b': + parse_bits_mask(optarg); + break; + case 'c': + parse_cgroup(optarg); + break; + case 'C': + opt_list_cgroup = 1; + break; + case 'd': + describe_flags(optarg); + exit(0); + case 'i': + opt_mark_idle = 1; + break; + case 'l': + opt_list = 1; + break; + case 'L': + opt_list = 2; + break; + case 'M': + opt_list_mapcnt = 1; + break; + case 'N': + opt_no_summary = 1; + break; + case 'X': + opt_hwpoison = 1; + prepare_hwpoison_fd(); + break; + case 'x': + opt_unpoison = 1; + prepare_hwpoison_fd(); + break; + case 'F': + parse_kpageflags(optarg); + break; + case 'h': + usage(); + exit(0); + default: + usage(); + exit(1); + } + } + + if (!opt_kpageflags) + opt_kpageflags = PROC_KPAGEFLAGS; + + if (opt_cgroup || opt_list_cgroup) + kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY); + + if (opt_list && opt_list_mapcnt) + kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY); + + if (opt_mark_idle) + page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR); + + if (opt_list && opt_pid) + 
printf("voffset\t"); + if (opt_list && opt_file) + printf("foffset\t"); + if (opt_list && opt_list_cgroup) + printf("cgroup\t"); + if (opt_list && opt_list_mapcnt) + printf("map-cnt\t"); + + if (opt_list == 1) + printf("offset\tlen\tflags\n"); + if (opt_list == 2) + printf("offset\tflags\n"); + + if (opt_file) + walk_page_cache(); + else + walk_addr_ranges(); + + if (opt_list == 1) + flush_page_range(); + + if (opt_no_summary) + return 0; + + if (opt_list) + printf("\n\n"); + + if (opt_file) { + show_file(opt_file, &st); + printf("\n"); + } + + show_summary(); + + if (opt_list_mapcnt) + close(kpagecount_fd); + + if (page_idle_fd >= 0) + close(page_idle_fd); + + return 0; +} diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c new file mode 100644 index 000000000000..7c2ac124cdc8 --- /dev/null +++ b/tools/mm/page_owner_sort.c @@ -0,0 +1,897 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * User-space helper to sort the output of /sys/kernel/debug/page_owner + * + * Example use: + * cat /sys/kernel/debug/page_owner > page_owner_full.txt + * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt + * Or sort by total memory: + * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt + * + * See Documentation/mm/page_owner.rst +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define bool int +#define true 1 +#define false 0 +#define TASK_COMM_LEN 16 + +struct block_list { + char *txt; + char *comm; // task command name + char *stacktrace; + __u64 ts_nsec; + __u64 free_ts_nsec; + int len; + int num; + int page_num; + pid_t pid; + pid_t tgid; + int allocator; +}; +enum FILTER_BIT { + FILTER_UNRELEASE = 1<<1, + FILTER_PID = 1<<2, + FILTER_TGID = 1<<3, + FILTER_COMM = 1<<4 +}; +enum CULL_BIT { + CULL_UNRELEASE = 1<<1, + CULL_PID = 1<<2, + CULL_TGID = 1<<3, + CULL_COMM = 1<<4, + CULL_STACKTRACE = 1<<5, + CULL_ALLOCATOR = 1<<6 +}; +enum ALLOCATOR_BIT { + ALLOCATOR_CMA = 1<<1, + ALLOCATOR_SLAB = 1<<2, + ALLOCATOR_VMALLOC = 1<<3, + ALLOCATOR_OTHERS = 1<<4 +}; +enum ARG_TYPE { + ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_FREE_TS, + ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE, + ARG_ALLOCATOR +}; +enum SORT_ORDER { + SORT_ASC = 1, + SORT_DESC = -1, +}; +struct filter_condition { + pid_t *pids; + pid_t *tgids; + char **comms; + int pids_size; + int tgids_size; + int comms_size; +}; +struct sort_condition { + int (**cmps)(const void *, const void *); + int *signs; + int size; +}; +static struct filter_condition fc; +static struct sort_condition sc; +static regex_t order_pattern; +static regex_t pid_pattern; +static regex_t tgid_pattern; +static regex_t comm_pattern; +static regex_t ts_nsec_pattern; +static regex_t free_ts_nsec_pattern; +static struct block_list *list; +static int list_size; +static int max_size; +static int cull; +static int filter; +static bool debug_on; + +static void set_single_cmp(int (*cmp)(const void *, const void *), int sign); + +int read_block(char *buf, char *ext_buf, int buf_size, FILE *fin) +{ + char *curr = buf, *const buf_end = buf + buf_size; + + while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) { + if (*curr == '\n') { /* empty line */ + return curr - buf; + } + if (!strncmp(curr, "PFN", 3)) { + strcpy(ext_buf, curr); + continue; + } + curr += strlen(curr); + } + + return -1; /* EOF or no space left in buf. 
*/ +} + +static int compare_txt(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->txt, l2->txt); +} + +static int compare_stacktrace(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->stacktrace, l2->stacktrace); +} + +static int compare_num(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->num - l2->num; +} + +static int compare_page_num(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->page_num - l2->page_num; +} + +static int compare_pid(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->pid - l2->pid; +} + +static int compare_tgid(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->tgid - l2->tgid; +} + +static int compare_allocator(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->allocator - l2->allocator; +} + +static int compare_comm(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->comm, l2->comm); +} + +static int compare_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->ts_nsec < l2->ts_nsec ? -1 : 1; +} + +static int compare_free_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1; +} + +static int compare_release(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + if (!l1->free_ts_nsec && !l2->free_ts_nsec) + return 0; + if (l1->free_ts_nsec && l2->free_ts_nsec) + return 0; + return l1->free_ts_nsec ? 
1 : -1; +} + +static int compare_cull_condition(const void *p1, const void *p2) +{ + if (cull == 0) + return compare_txt(p1, p2); + if ((cull & CULL_STACKTRACE) && compare_stacktrace(p1, p2)) + return compare_stacktrace(p1, p2); + if ((cull & CULL_PID) && compare_pid(p1, p2)) + return compare_pid(p1, p2); + if ((cull & CULL_TGID) && compare_tgid(p1, p2)) + return compare_tgid(p1, p2); + if ((cull & CULL_COMM) && compare_comm(p1, p2)) + return compare_comm(p1, p2); + if ((cull & CULL_UNRELEASE) && compare_release(p1, p2)) + return compare_release(p1, p2); + if ((cull & CULL_ALLOCATOR) && compare_allocator(p1, p2)) + return compare_allocator(p1, p2); + return 0; +} + +static int compare_sort_condition(const void *p1, const void *p2) +{ + int cmp = 0; + + for (int i = 0; i < sc.size; ++i) + if (cmp == 0) + cmp = sc.signs[i] * sc.cmps[i](p1, p2); + return cmp; +} + +static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) +{ + int err, val_len; + regmatch_t pmatch[2]; + + err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL); + if (err != 0 || pmatch[1].rm_so == -1) { + if (debug_on) + fprintf(stderr, "no matching pattern in %s\n", buf); + return -1; + } + val_len = pmatch[1].rm_eo - pmatch[1].rm_so; + + memcpy(pattern_str, buf + pmatch[1].rm_so, val_len); + + return 0; +} + +static bool check_regcomp(regex_t *pattern, const char *regex) +{ + int err; + + err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); + if (err != 0 || pattern->re_nsub != 1) { + fprintf(stderr, "Invalid pattern %s code %d\n", regex, err); + return false; + } + return true; +} + +static char **explode(char sep, const char *str, int *size) +{ + int count = 0, len = strlen(str); + int lastindex = -1, j = 0; + + for (int i = 0; i < len; i++) + if (str[i] == sep) + count++; + char **ret = calloc(++count, sizeof(char *)); + + for (int i = 0; i < len; i++) { + if (str[i] == sep) { + ret[j] = calloc(i - lastindex, sizeof(char)); + memcpy(ret[j++], str + lastindex + 1, i - lastindex - 1); + lastindex = i; + } + } + if (lastindex <= len - 1) { + ret[j] = calloc(len - lastindex, sizeof(char)); + memcpy(ret[j++], str + lastindex + 1, strlen(str) - 1 - lastindex); + } + *size = j; + return ret; +} + +static void free_explode(char **arr, int size) +{ + for (int i = 0; i < size; i++) + free(arr[i]); + free(arr); +} + +# define FIELD_BUFF 25 + +static int get_page_num(char *buf) +{ + int order_val; + char order_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&order_pattern, order_str, buf); + errno = 0; + order_val = strtol(order_str, &endptr, 10); + if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') { + if (debug_on) + fprintf(stderr, "wrong order in follow buf:\n%s\n", buf); + return 0; + } + + return 1 << order_val; +} + +static pid_t get_pid(char *buf) +{ + pid_t pid; + char pid_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&pid_pattern, pid_str, buf); + errno = 0; + pid = strtol(pid_str, &endptr, 10); + if (errno != 0 || endptr == pid_str || *endptr != '\0') { + if (debug_on) + fprintf(stderr, "wrong/invalid pid in follow buf:\n%s\n", buf); + return -1; + } + + return pid; + +} + +static pid_t get_tgid(char *buf) +{ + pid_t tgid; + char tgid_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&tgid_pattern, tgid_str, buf); + errno = 0; + tgid = strtol(tgid_str, &endptr, 10); + if (errno != 0 || endptr == tgid_str || *endptr != '\0') { + if (debug_on) + fprintf(stderr, "wrong/invalid tgid in follow buf:\n%s\n", buf); + return -1; + } + + return tgid; + +} + 
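+/*
+ * Each get_*() field parser in this file follows one pattern: extract a
+ * single field from a page_owner record with a precompiled regex (see the
+ * check_regcomp() calls in main()), then convert it with strtol() or
+ * strtoull(); on a parse failure it reports a sentinel (-1, 0 or NULL)
+ * and, with -d, logs the offending block. The record header they match
+ * looks like, e.g.:
+ *
+ *   Page allocated via order 0, ..., pid 1, tgid 1 (init), ts 123456 ns, free_ts 789012 ns
+ */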
+static __u64 get_ts_nsec(char *buf) +{ + __u64 ts_nsec; + char ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&ts_nsec_pattern, ts_nsec_str, buf); + errno = 0; + ts_nsec = strtoull(ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') { + if (debug_on) + fprintf(stderr, "wrong ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return ts_nsec; +} + +static __u64 get_free_ts_nsec(char *buf) +{ + __u64 free_ts_nsec; + char free_ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf); + errno = 0; + free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') { + if (debug_on) + fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return free_ts_nsec; +} + +static char *get_comm(char *buf) +{ + char *comm_str = malloc(TASK_COMM_LEN); + + memset(comm_str, 0, TASK_COMM_LEN); + + search_pattern(&comm_pattern, comm_str, buf); + errno = 0; + if (errno != 0) { + if (debug_on) + fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf); + return NULL; + } + + return comm_str; +} + +static int get_arg_type(const char *arg) +{ + if (!strcmp(arg, "pid") || !strcmp(arg, "p")) + return ARG_PID; + else if (!strcmp(arg, "tgid") || !strcmp(arg, "tg")) + return ARG_TGID; + else if (!strcmp(arg, "name") || !strcmp(arg, "n")) + return ARG_COMM; + else if (!strcmp(arg, "stacktrace") || !strcmp(arg, "st")) + return ARG_STACKTRACE; + else if (!strcmp(arg, "free") || !strcmp(arg, "f")) + return ARG_FREE; + else if (!strcmp(arg, "txt") || !strcmp(arg, "T")) + return ARG_TXT; + else if (!strcmp(arg, "free_ts") || !strcmp(arg, "ft")) + return ARG_FREE_TS; + else if (!strcmp(arg, "alloc_ts") || !strcmp(arg, "at")) + return ARG_ALLOC_TS; + else if (!strcmp(arg, "allocator") || !strcmp(arg, "ator")) + return ARG_ALLOCATOR; + else { + return ARG_UNKNOWN; + } +} + +static int get_allocator(const char *buf, const char *migrate_info) +{ + char *tmp, *first_line, *second_line; + int allocator = 0; + + if (strstr(migrate_info, "CMA")) + allocator |= ALLOCATOR_CMA; + if (strstr(migrate_info, "slab")) + allocator |= ALLOCATOR_SLAB; + tmp = strstr(buf, "__vmalloc_node_range"); + if (tmp) { + second_line = tmp; + while (*tmp != '\n') + tmp--; + tmp--; + while (*tmp != '\n') + tmp--; + first_line = ++tmp; + tmp = strstr(tmp, "alloc_pages"); + if (tmp && first_line <= tmp && tmp < second_line) + allocator |= ALLOCATOR_VMALLOC; + } + if (allocator == 0) + allocator = ALLOCATOR_OTHERS; + return allocator; +} + +static bool match_num_list(int num, int *list, int list_size) +{ + for (int i = 0; i < list_size; ++i) + if (list[i] == num) + return true; + return false; +} + +static bool match_str_list(const char *str, char **list, int list_size) +{ + for (int i = 0; i < list_size; ++i) + if (!strcmp(list[i], str)) + return true; + return false; +} + +static bool is_need(char *buf) +{ + __u64 ts_nsec, free_ts_nsec; + + ts_nsec = get_ts_nsec(buf); + free_ts_nsec = get_free_ts_nsec(buf); + + if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec) + return false; + if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) + return false; + if ((filter & FILTER_TGID) && + !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size)) + return false; + + char *comm = get_comm(buf); + + if ((filter & FILTER_COMM) && + !match_str_list(comm, fc.comms, fc.comms_size)) { + free(comm); + return false; 
+ } + free(comm); + return true; +} + +static bool add_list(char *buf, int len, char *ext_buf) +{ + if (list_size != 0 && + len == list[list_size-1].len && + memcmp(buf, list[list_size-1].txt, len) == 0) { + list[list_size-1].num++; + list[list_size-1].page_num += get_page_num(buf); + return true; + } + if (list_size == max_size) { + fprintf(stderr, "max_size too small??\n"); + return false; + } + if (!is_need(buf)) + return true; + list[list_size].pid = get_pid(buf); + list[list_size].tgid = get_tgid(buf); + list[list_size].comm = get_comm(buf); + list[list_size].txt = malloc(len+1); + if (!list[list_size].txt) { + fprintf(stderr, "Out of memory\n"); + return false; + } + memcpy(list[list_size].txt, buf, len); + list[list_size].txt[len] = 0; + list[list_size].len = len; + list[list_size].num = 1; + list[list_size].page_num = get_page_num(buf); + + list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; + if (*list[list_size].stacktrace == '\n') + list[list_size].stacktrace++; + list[list_size].ts_nsec = get_ts_nsec(buf); + list[list_size].free_ts_nsec = get_free_ts_nsec(buf); + list[list_size].allocator = get_allocator(buf, ext_buf); + list_size++; + if (list_size % 1000 == 0) { + printf("loaded %d\r", list_size); + fflush(stdout); + } + return true; +} + +static bool parse_cull_args(const char *arg_str) +{ + int size = 0; + char **args = explode(',', arg_str, &size); + + for (int i = 0; i < size; ++i) { + int arg_type = get_arg_type(args[i]); + + if (arg_type == ARG_PID) + cull |= CULL_PID; + else if (arg_type == ARG_TGID) + cull |= CULL_TGID; + else if (arg_type == ARG_COMM) + cull |= CULL_COMM; + else if (arg_type == ARG_STACKTRACE) + cull |= CULL_STACKTRACE; + else if (arg_type == ARG_FREE) + cull |= CULL_UNRELEASE; + else if (arg_type == ARG_ALLOCATOR) + cull |= CULL_ALLOCATOR; + else { + free_explode(args, size); + return false; + } + } + free_explode(args, size); + if (sc.size == 0) + set_single_cmp(compare_num, SORT_DESC); + return true; +} + +static void set_single_cmp(int (*cmp)(const void *, const void *), int sign) +{ + if (sc.signs == NULL || sc.size < 1) + sc.signs = calloc(1, sizeof(int)); + sc.signs[0] = sign; + if (sc.cmps == NULL || sc.size < 1) + sc.cmps = calloc(1, sizeof(int *)); + sc.cmps[0] = cmp; + sc.size = 1; +} + +static bool parse_sort_args(const char *arg_str) +{ + int size = 0; + + if (sc.size != 0) { /* reset sort_condition */ + free(sc.signs); + free(sc.cmps); + size = 0; + } + + char **args = explode(',', arg_str, &size); + + sc.signs = calloc(size, sizeof(int)); + sc.cmps = calloc(size, sizeof(int *)); + for (int i = 0; i < size; ++i) { + int offset = 0; + + sc.signs[i] = SORT_ASC; + if (args[i][0] == '-' || args[i][0] == '+') { + if (args[i][0] == '-') + sc.signs[i] = SORT_DESC; + offset = 1; + } + + int arg_type = get_arg_type(args[i]+offset); + + if (arg_type == ARG_PID) + sc.cmps[i] = compare_pid; + else if (arg_type == ARG_TGID) + sc.cmps[i] = compare_tgid; + else if (arg_type == ARG_COMM) + sc.cmps[i] = compare_comm; + else if (arg_type == ARG_STACKTRACE) + sc.cmps[i] = compare_stacktrace; + else if (arg_type == ARG_ALLOC_TS) + sc.cmps[i] = compare_ts; + else if (arg_type == ARG_FREE_TS) + sc.cmps[i] = compare_free_ts; + else if (arg_type == ARG_TXT) + sc.cmps[i] = compare_txt; + else if (arg_type == ARG_ALLOCATOR) + sc.cmps[i] = compare_allocator; + else { + free_explode(args, size); + sc.size = 0; + return false; + } + } + sc.size = size; + free_explode(args, size); + return true; +} + +static int *parse_nums_list(char *arg_str, int 
*list_size)
+{
+	int size = 0;
+	char **args = explode(',', arg_str, &size);
+	int *list = calloc(size, sizeof(int));
+
+	errno = 0;
+	for (int i = 0; i < size; ++i) {
+		char *endptr = NULL;
+
+		list[i] = strtol(args[i], &endptr, 10);
+		if (errno != 0 || endptr == args[i] || *endptr != '\0') {
+			free(list);
+			return NULL;
+		}
+	}
+	*list_size = size;
+	free_explode(args, size);
+	return list;
+}
+
+static void print_allocator(FILE *out, int allocator)
+{
+	fprintf(out, "allocated by ");
+	if (allocator & ALLOCATOR_CMA)
+		fprintf(out, "CMA ");
+	if (allocator & ALLOCATOR_SLAB)
+		fprintf(out, "SLAB ");
+	if (allocator & ALLOCATOR_VMALLOC)
+		fprintf(out, "VMALLOC ");
+	if (allocator & ALLOCATOR_OTHERS)
+		fprintf(out, "OTHERS ");
+}
+
+#define BUF_SIZE (128 * 1024)
+
+static void usage(void)
+{
+	printf("Usage: ./page_owner_sort [OPTIONS] <input> <output>\n"
+		"-m\t\tSort by total memory.\n"
+		"-s\t\tSort by the stack trace.\n"
+		"-t\t\tSort by times (default).\n"
+		"-p\t\tSort by pid.\n"
+		"-P\t\tSort by tgid.\n"
+		"-n\t\tSort by task command name.\n"
+		"-a\t\tSort by memory allocate time.\n"
+		"-r\t\tSort by memory release time.\n"
+		"-f\t\tFilter out the information of blocks whose memory has been released.\n"
+		"-d\t\tPrint debug information.\n"
+		"--pid <pidlist>\tSelect by pid. This selects the information of blocks whose process ID numbers appear in <pidlist>.\n"
+		"--tgid <tgidlist>\tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in <tgidlist>.\n"
+		"--name <cmdlist>\n\t\tSelect by command name. This selects the information of blocks whose command name appears in <cmdlist>.\n"
+		"--cull <rules>\tCull by user-defined rules. <rules> is a single argument in the form of a comma-separated list with some common fields predefined\n"
+		"--sort <order>\tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n"
+		);
+}
+
+int main(int argc, char **argv)
+{
+	FILE *fin, *fout;
+	char *buf, *ext_buf;
+	int i, count;
+	struct stat st;
+	int opt;
+	struct option longopts[] = {
+		{ "pid", required_argument, NULL, 1 },
+		{ "tgid", required_argument, NULL, 2 },
+		{ "name", required_argument, NULL, 3 },
+		{ "cull", required_argument, NULL, 4 },
+		{ "sort", required_argument, NULL, 5 },
+		{ 0, 0, 0, 0},
+	};
+
+	while ((opt = getopt_long(argc, argv, "adfmnprstP", longopts, NULL)) != -1)
+		switch (opt) {
+		case 'a':
+			set_single_cmp(compare_ts, SORT_ASC);
+			break;
+		case 'd':
+			debug_on = true;
+			break;
+		case 'f':
+			filter = filter | FILTER_UNRELEASE;
+			break;
+		case 'm':
+			set_single_cmp(compare_page_num, SORT_DESC);
+			break;
+		case 'p':
+			set_single_cmp(compare_pid, SORT_ASC);
+			break;
+		case 'r':
+			set_single_cmp(compare_free_ts, SORT_ASC);
+			break;
+		case 's':
+			set_single_cmp(compare_stacktrace, SORT_ASC);
+			break;
+		case 't':
+			set_single_cmp(compare_num, SORT_DESC);
+			break;
+		case 'P':
+			set_single_cmp(compare_tgid, SORT_ASC);
+			break;
+		case 'n':
+			set_single_cmp(compare_comm, SORT_ASC);
+			break;
+		case 1:
+			filter = filter | FILTER_PID;
+			fc.pids = parse_nums_list(optarg, &fc.pids_size);
+			if (fc.pids == NULL) {
+				fprintf(stderr, "wrong/invalid pid from the command line:%s\n",
+					optarg);
+				exit(1);
+			}
+			break;
+		case 2:
+			filter = filter | FILTER_TGID;
+			fc.tgids = parse_nums_list(optarg, &fc.tgids_size);
+			if (fc.tgids == NULL) {
+				fprintf(stderr, "wrong/invalid tgid from the command line:%s\n",
+					optarg);
+				exit(1);
+			}
+			break;
+		case 3:
+			filter = filter | FILTER_COMM;
+			fc.comms = explode(',', optarg, &fc.comms_size);
+			break;
+		case 4:
+			if (!parse_cull_args(optarg)) {
+				fprintf(stderr, "wrong argument after --cull option:%s\n",
+					optarg);
+				exit(1);
+			}
+			break;
+		case 5:
+			if (!parse_sort_args(optarg)) {
+				fprintf(stderr, "wrong argument after --sort option:%s\n",
+					optarg);
+				exit(1);
+			}
+			break;
+		default:
+			usage();
+			exit(1);
+		}
+
+	if (optind >= (argc - 1)) {
+		usage();
+		exit(1);
+	}
+
+	fin = fopen(argv[optind], "r");
+	fout = fopen(argv[optind + 1], "w");
+	if (!fin || !fout) {
+		usage();
+		perror("open: ");
+		exit(1);
+	}
+
+	if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),"))
+		goto out_order;
+	if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"))
+		goto out_pid;
+	if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "))
+		goto out_tgid;
+	if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"))
+		goto out_comm;
+	if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"))
+		goto out_ts;
+	if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"))
+		goto out_free_ts;
+
+	fstat(fileno(fin), &st);
+	max_size = st.st_size / 100; /* hack ... */
+
+	list = malloc(max_size * sizeof(*list));
+	buf = malloc(BUF_SIZE);
+	ext_buf = malloc(BUF_SIZE);
+	if (!list || !buf || !ext_buf) {
+		fprintf(stderr, "Out of memory\n");
+		goto out_free;
+	}
+
+	for ( ; ; ) {
+		int buf_len = read_block(buf, ext_buf, BUF_SIZE, fin);
+
+		if (buf_len < 0)
+			break;
+		if (!add_list(buf, buf_len, ext_buf))
+			goto out_free;
+	}
+
+	printf("loaded %d\n", list_size);
+
+	printf("sorting ....\n");
+
+	qsort(list, list_size, sizeof(list[0]), compare_cull_condition);
+
+	printf("culling\n");
+
+	for (i = count = 0; i < list_size; i++) {
+		if (count == 0 ||
+		    compare_cull_condition((void *)(&list[count-1]), (void *)(&list[i])) != 0) {
+			list[count++] = list[i];
+		} else {
+			list[count-1].num += list[i].num;
+			list[count-1].page_num += list[i].page_num;
+		}
+	}
+
+	qsort(list, count, sizeof(list[0]), compare_sort_condition);
+
+	for (i = 0; i < count; i++) {
+		if (cull == 0) {
+			fprintf(fout, "%d times, %d pages, ", list[i].num, list[i].page_num);
+			print_allocator(fout, list[i].allocator);
+			fprintf(fout, ":\n%s\n", list[i].txt);
+		} else {
+			fprintf(fout, "%d times, %d pages",
+					list[i].num, list[i].page_num);
+			if (cull & CULL_PID || filter & FILTER_PID)
+				fprintf(fout, ", PID %d", list[i].pid);
+			if (cull & CULL_TGID || filter & FILTER_TGID)
+				fprintf(fout, ", TGID %d", list[i].tgid);
+			if (cull & CULL_COMM || filter & FILTER_COMM)
+				fprintf(fout, ", task_comm_name: %s", list[i].comm);
+			if (cull & CULL_ALLOCATOR) {
+				fprintf(fout, ", ");
+				print_allocator(fout, list[i].allocator);
+			}
+			if (cull & CULL_UNRELEASE)
+				fprintf(fout, " (%s)",
+						list[i].free_ts_nsec ? "UNRELEASED" : "RELEASED");
+			if (cull & CULL_STACKTRACE)
+				fprintf(fout, ":\n%s", list[i].stacktrace);
+			fprintf(fout, "\n");
+		}
+	}
+
+out_free:
+	if (ext_buf)
+		free(ext_buf);
+	if (buf)
+		free(buf);
+	if (list)
+		free(list);
+out_free_ts:
+	regfree(&free_ts_nsec_pattern);
+out_ts:
+	regfree(&ts_nsec_pattern);
+out_comm:
+	regfree(&comm_pattern);
+out_tgid:
+	regfree(&tgid_pattern);
+out_pid:
+	regfree(&pid_pattern);
+out_order:
+	regfree(&order_pattern);
+
+	return 0;
+}
diff --git a/tools/mm/slabinfo-gnuplot.sh b/tools/mm/slabinfo-gnuplot.sh
new file mode 100644
index 000000000000..873a892147e5
--- /dev/null
+++ b/tools/mm/slabinfo-gnuplot.sh
@@ -0,0 +1,268 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only

+# Sergey Senozhatsky, 2015
+# sergey.senozhatsky.work@gmail.com
+#


+# This program is intended to plot `slabinfo -X' stats, collected,
+# for example, using the following command:
+#   while [ 1 ]; do slabinfo -X >> stats; sleep 1; done
+#
+# Use `slabinfo-gnuplot.sh stats' to pre-process collected records
+# and generate graphs (totals, slabs sorted by size, slabs sorted
+# by loss).
+#
+# Graphs can be [individually] regenerated with different ranges and
+# sizes (-r %d,%d and -s %d,%d options).
+#
+# To visually compare N `totals' graphs, do
+#   slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals
+#
+
+min_slab_name_size=11
+xmin=0
+xmax=0
+width=1500
+height=700
+mode=preprocess
+
+usage()
+{
+	echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]"
+	echo "FILEs must contain 'slabinfo -X' samples"
+	echo "-t - plot totals for FILE(s)"
+	echo "-l - plot slabs stats for FILE(s)"
+	echo "-s %d,%d - set image width and height"
+	echo "-r %d,%d - use data samples from a given range"
+}
+
+check_file_exist()
+{
+	if [ ! -f "$1" ]; then
+		echo "File '$1' does not exist"
+		exit 1
+	fi
+}
+
+do_slabs_plotting()
+{
+	local file=$1
+	local out_file
+	local range="every ::$xmin"
+	local xtic=""
+	local xtic_rotate="norotate"
+	local lines=2000000
+	local wc_lines
+
+	check_file_exist "$file"
+
+	out_file=`basename "$file"`
+	if [ $xmax -ne 0 ]; then
+		range="$range::$xmax"
+		lines=$((xmax-xmin))
+	fi
+
+	wc_lines=`cat "$file" | wc -l`
+	if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then
+		wc_lines=$lines
+	fi
+
+	if [ "$wc_lines" -lt "$lines" ]; then
+		lines=$wc_lines
+	fi
+
+	if [ $((width / lines)) -gt $min_slab_name_size ]; then
+		xtic=":xtic(1)"
+		xtic_rotate=90
+	fi
+
+gnuplot -p << EOF
+#!/usr/bin/env gnuplot
+
+set terminal png enhanced size $width,$height large
+set output '$out_file.png'
+set autoscale xy
+set xlabel 'samples'
+set ylabel 'bytes'
+set style histogram columnstacked title textcolor lt -1
+set style fill solid 0.15
+set xtics rotate $xtic_rotate
+set key left above Left title reverse
+
+plot "$file" $range u 2$xtic title 'SIZE' with boxes,\
+	'' $range u 3 title 'LOSS' with boxes
+EOF
+
+	if [ $?
-eq 0 ]; then + echo "$out_file.png" + fi +} + +do_totals_plotting() +{ + local gnuplot_cmd="" + local range="every ::$xmin" + local file="" + + if [ $xmax -ne 0 ]; then + range="$range::$xmax" + fi + + for i in "${t_files[@]}"; do + check_file_exist "$i" + + file="$file"`basename "$i"` + gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\ + '$i Memory usage' with lines," + gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \ + '$i Loss' with lines," + done + +gnuplot -p << EOF +#!/usr/bin/env gnuplot + +set terminal png enhanced size $width,$height large +set autoscale xy +set output '$file.png' +set xlabel 'samples' +set ylabel 'bytes' +set key left above Left title reverse + +plot $gnuplot_cmd +EOF + + if [ $? -eq 0 ]; then + echo "$file.png" + fi +} + +do_preprocess() +{ + local out + local lines + local in=$1 + + check_file_exist "$in" + + # use only 'TOP' slab (biggest memory usage or loss) + let lines=3 + out=`basename "$in"`"-slabs-by-loss" + `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\ + grep -E -iv '\-\-|Name|Slabs'\ + | awk '{print $1" "$4+$2*$3" "$4}' > "$out"` + if [ $? -eq 0 ]; then + do_slabs_plotting "$out" + fi + + let lines=3 + out=`basename "$in"`"-slabs-by-size" + `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\ + grep -E -iv '\-\-|Name|Slabs'\ + | awk '{print $1" "$4" "$4-$2*$3}' > "$out"` + if [ $? -eq 0 ]; then + do_slabs_plotting "$out" + fi + + out=`basename "$in"`"-totals" + `cat "$in" | grep "Memory used" |\ + awk '{print $3" "$7}' > "$out"` + if [ $? -eq 0 ]; then + t_files[0]=$out + do_totals_plotting + fi +} + +parse_opts() +{ + local opt + + while getopts "tlr::s::h" opt; do + case $opt in + t) + mode=totals + ;; + l) + mode=slabs + ;; + s) + array=(${OPTARG//,/ }) + width=${array[0]} + height=${array[1]} + ;; + r) + array=(${OPTARG//,/ }) + xmin=${array[0]} + xmax=${array[1]} + ;; + h) + usage + exit 0 + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + exit 1 + ;; + :) + echo "-$OPTARG requires an argument." >&2 + exit 1 + ;; + esac + done + + return $OPTIND +} + +parse_args() +{ + local idx=0 + local p + + for p in "$@"; do + case $mode in + preprocess) + files[$idx]=$p + idx=$idx+1 + ;; + totals) + t_files[$idx]=$p + idx=$idx+1 + ;; + slabs) + files[$idx]=$p + idx=$idx+1 + ;; + esac + done +} + +parse_opts "$@" +argstart=$? 
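+# Note: getopts reports where option parsing stopped via OPTIND, and
+# parse_opts hands that back through its return status, so "${@:$argstart}"
+# below is exactly the remaining positional FILE arguments. Since a return
+# status is a single byte, this assumes fewer than 256 leading words on the
+# command line, which holds for any sane invocation.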
+parse_args "${@:$argstart}" + +if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then + usage + exit 1 +fi + +case $mode in + preprocess) + for i in "${files[@]}"; do + do_preprocess "$i" + done + ;; + totals) + do_totals_plotting + ;; + slabs) + for i in "${files[@]}"; do + do_slabs_plotting "$i" + done + ;; + *) + echo "Unknown mode $mode" >&2 + usage + exit 1 + ;; +esac diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c new file mode 100644 index 000000000000..cfaeaea71042 --- /dev/null +++ b/tools/mm/slabinfo.c @@ -0,0 +1,1544 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Slabinfo: Tool to get reports about slabs + * + * (C) 2007 sgi, Christoph Lameter + * (C) 2011 Linux Foundation, Christoph Lameter + * + * Compile with: + * + * gcc -o slabinfo slabinfo.c + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_SLABS 500 +#define MAX_ALIASES 500 +#define MAX_NODES 1024 + +struct slabinfo { + char *name; + int alias; + int refs; + int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu; + unsigned int hwcache_align, object_size, objs_per_slab; + unsigned int sanity_checks, slab_size, store_user, trace; + int order, poison, reclaim_account, red_zone; + unsigned long partial, objects, slabs, objects_partial, objects_total; + unsigned long alloc_fastpath, alloc_slowpath; + unsigned long free_fastpath, free_slowpath; + unsigned long free_frozen, free_add_partial, free_remove_partial; + unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill; + unsigned long cpuslab_flush, deactivate_full, deactivate_empty; + unsigned long deactivate_to_head, deactivate_to_tail; + unsigned long deactivate_remote_frees, order_fallback; + unsigned long cmpxchg_double_cpu_fail, cmpxchg_double_fail; + unsigned long alloc_node_mismatch, deactivate_bypass; + unsigned long cpu_partial_alloc, cpu_partial_free; + int numa[MAX_NODES]; + int numa_partial[MAX_NODES]; +} slabinfo[MAX_SLABS]; + +struct aliasinfo { + char *name; + char *ref; + struct slabinfo *slab; +} aliasinfo[MAX_ALIASES]; + +int slabs; +int actual_slabs; +int aliases; +int alias_targets; +int highest_node; + +char buffer[4096]; + +int show_empty; +int show_report; +int show_alias; +int show_slab; +int skip_zero = 1; +int show_numa; +int show_track; +int show_first_alias; +int validate; +int shrink; +int show_inverted; +int show_single_ref; +int show_totals; +int sort_size; +int sort_active; +int set_debug; +int show_ops; +int sort_partial; +int show_activity; +int output_lines = -1; +int sort_loss; +int extended_totals; +int show_bytes; +int unreclaim_only; + +/* Debug options */ +int sanity; +int redzone; +int poison; +int tracking; +int tracing; + +int page_size; + +regex_t pattern; + +static void fatal(const char *x, ...) +{ + va_list ap; + + va_start(ap, x); + vfprintf(stderr, x, ap); + va_end(ap); + exit(EXIT_FAILURE); +} + +static void usage(void) +{ + printf("slabinfo 4/15/2011. 
(c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" + "slabinfo [-aABDefhilLnoPrsStTUvXz1] [N=K] [-dafzput] [slab-regexp]\n" + "-a|--aliases Show aliases\n" + "-A|--activity Most active slabs first\n" + "-B|--Bytes Show size in bytes\n" + "-D|--display-active Switch line format to activity\n" + "-e|--empty Show empty slabs\n" + "-f|--first-alias Show first alias\n" + "-h|--help Show usage information\n" + "-i|--inverted Inverted list\n" + "-l|--slabs Show slabs\n" + "-L|--Loss Sort by loss\n" + "-n|--numa Show NUMA information\n" + "-N|--lines=K Show the first K slabs\n" + "-o|--ops Show kmem_cache_ops\n" + "-P|--partial Sort by number of partial slabs\n" + "-r|--report Detailed report on single slabs\n" + "-s|--shrink Shrink slabs\n" + "-S|--Size Sort by size\n" + "-t|--tracking Show alloc/free information\n" + "-T|--Totals Show summary information\n" + "-U|--Unreclaim Show unreclaimable slabs only\n" + "-v|--validate Validate slabs\n" + "-X|--Xtotals Show extended summary information\n" + "-z|--zero Include empty slabs\n" + "-1|--1ref Single reference\n" + + "\n" + "-d | --debug Switch off all debug options\n" + "-da | --debug=a Switch on all debug options (--debug=FZPU)\n" + + "\n" + "-d[afzput] | --debug=[afzput]\n" + " f | F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" + " z | Z Redzoning\n" + " p | P Poisoning\n" + " u | U Tracking\n" + " t | T Tracing\n" + + "\nSorting options (--Loss, --Size, --Partial) are mutually exclusive\n" + ); +} + +static unsigned long read_obj(const char *name) +{ + FILE *f = fopen(name, "r"); + + if (!f) { + buffer[0] = 0; + if (errno == EACCES) + fatal("%s, Try using superuser\n", strerror(errno)); + } else { + if (!fgets(buffer, sizeof(buffer), f)) + buffer[0] = 0; + fclose(f); + if (buffer[strlen(buffer)] == '\n') + buffer[strlen(buffer)] = 0; + } + return strlen(buffer); +} + + +/* + * Get the contents of an attribute + */ +static unsigned long get_obj(const char *name) +{ + if (!read_obj(name)) + return 0; + + return atol(buffer); +} + +static unsigned long get_obj_and_str(const char *name, char **x) +{ + unsigned long result = 0; + char *p; + + *x = NULL; + + if (!read_obj(name)) { + x = NULL; + return 0; + } + result = strtoul(buffer, &p, 10); + while (*p == ' ') + p++; + if (*p) + *x = strdup(p); + return result; +} + +static void set_obj(struct slabinfo *s, const char *name, int n) +{ + char x[100]; + FILE *f; + + snprintf(x, 100, "%s/%s", s->name, name); + f = fopen(x, "w"); + if (!f) + fatal("Cannot write to %s\n", x); + + fprintf(f, "%d\n", n); + fclose(f); +} + +static unsigned long read_slab_obj(struct slabinfo *s, const char *name) +{ + char x[100]; + FILE *f; + size_t l; + + snprintf(x, 100, "%s/%s", s->name, name); + f = fopen(x, "r"); + if (!f) { + buffer[0] = 0; + l = 0; + } else { + l = fread(buffer, 1, sizeof(buffer), f); + buffer[l] = 0; + fclose(f); + } + return l; +} + +static unsigned long read_debug_slab_obj(struct slabinfo *s, const char *name) +{ + char x[128]; + FILE *f; + size_t l; + + snprintf(x, 128, "/sys/kernel/debug/slab/%s/%s", s->name, name); + f = fopen(x, "r"); + if (!f) { + buffer[0] = 0; + l = 0; + } else { + l = fread(buffer, 1, sizeof(buffer), f); + buffer[l] = 0; + fclose(f); + } + return l; +} + +/* + * Put a size string together + */ +static int store_size(char *buffer, unsigned long value) +{ + unsigned long divisor = 1; + char trailer = 0; + int n; + + if (!show_bytes) { + if (value > 1000000000UL) { + divisor = 100000000UL; + trailer = 'G'; + } else if (value > 1000000UL) { + divisor = 100000UL; + trailer = 'M'; + } 
else if (value > 1000UL) { + divisor = 100; + trailer = 'K'; + } + } + + value /= divisor; + n = sprintf(buffer, "%ld",value); + if (trailer) { + buffer[n] = trailer; + n++; + buffer[n] = 0; + } + if (divisor != 1) { + memmove(buffer + n - 2, buffer + n - 3, 4); + buffer[n-2] = '.'; + n++; + } + return n; +} + +static void decode_numa_list(int *numa, char *t) +{ + int node; + int nr; + + memset(numa, 0, MAX_NODES * sizeof(int)); + + if (!t) + return; + + while (*t == 'N') { + t++; + node = strtoul(t, &t, 10); + if (*t == '=') { + t++; + nr = strtoul(t, &t, 10); + numa[node] = nr; + if (node > highest_node) + highest_node = node; + } + while (*t == ' ') + t++; + } +} + +static void slab_validate(struct slabinfo *s) +{ + if (strcmp(s->name, "*") == 0) + return; + + set_obj(s, "validate", 1); +} + +static void slab_shrink(struct slabinfo *s) +{ + if (strcmp(s->name, "*") == 0) + return; + + set_obj(s, "shrink", 1); +} + +int line = 0; + +static void first_line(void) +{ + if (show_activity) + printf("Name Objects Alloc Free" + " %%Fast Fallb O CmpX UL\n"); + else + printf("Name Objects Objsize %s " + "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n", + sort_loss ? " Loss" : "Space"); +} + +/* + * Find the shortest alias of a slab + */ +static struct aliasinfo *find_one_alias(struct slabinfo *find) +{ + struct aliasinfo *a; + struct aliasinfo *best = NULL; + + for(a = aliasinfo;a < aliasinfo + aliases; a++) { + if (a->slab == find && + (!best || strlen(best->name) < strlen(a->name))) { + best = a; + if (strncmp(a->name,"kmall", 5) == 0) + return best; + } + } + return best; +} + +static unsigned long slab_size(struct slabinfo *s) +{ + return s->slabs * (page_size << s->order); +} + +static unsigned long slab_activity(struct slabinfo *s) +{ + return s->alloc_fastpath + s->free_fastpath + + s->alloc_slowpath + s->free_slowpath; +} + +static unsigned long slab_waste(struct slabinfo *s) +{ + return slab_size(s) - s->objects * s->object_size; +} + +static void slab_numa(struct slabinfo *s, int mode) +{ + int node; + + if (strcmp(s->name, "*") == 0) + return; + + if (!highest_node) { + printf("\n%s: No NUMA information available.\n", s->name); + return; + } + + if (skip_zero && !s->slabs) + return; + + if (!line) { + printf("\n%-21s:", mode ? "NUMA nodes" : "Slab"); + for(node = 0; node <= highest_node; node++) + printf(" %4d", node); + printf("\n----------------------"); + for(node = 0; node <= highest_node; node++) + printf("-----"); + printf("\n"); + } + printf("%-21s ", mode ? 
"All slabs" : s->name); + for(node = 0; node <= highest_node; node++) { + char b[20]; + + store_size(b, s->numa[node]); + printf(" %4s", b); + } + printf("\n"); + if (mode) { + printf("%-21s ", "Partial slabs"); + for(node = 0; node <= highest_node; node++) { + char b[20]; + + store_size(b, s->numa_partial[node]); + printf(" %4s", b); + } + printf("\n"); + } + line++; +} + +static void show_tracking(struct slabinfo *s) +{ + printf("\n%s: Kernel object allocation\n", s->name); + printf("-----------------------------------------------------------------------\n"); + if (read_debug_slab_obj(s, "alloc_traces")) + printf("%s", buffer); + else if (read_slab_obj(s, "alloc_calls")) + printf("%s", buffer); + else + printf("No Data\n"); + + printf("\n%s: Kernel object freeing\n", s->name); + printf("------------------------------------------------------------------------\n"); + if (read_debug_slab_obj(s, "free_traces")) + printf("%s", buffer); + else if (read_slab_obj(s, "free_calls")) + printf("%s", buffer); + else + printf("No Data\n"); + +} + +static void ops(struct slabinfo *s) +{ + if (strcmp(s->name, "*") == 0) + return; + + if (read_slab_obj(s, "ops")) { + printf("\n%s: kmem_cache operations\n", s->name); + printf("--------------------------------------------\n"); + printf("%s", buffer); + } else + printf("\n%s has no kmem_cache operations\n", s->name); +} + +static const char *onoff(int x) +{ + if (x) + return "On "; + return "Off"; +} + +static void slab_stats(struct slabinfo *s) +{ + unsigned long total_alloc; + unsigned long total_free; + unsigned long total; + + if (!s->alloc_slab) + return; + + total_alloc = s->alloc_fastpath + s->alloc_slowpath; + total_free = s->free_fastpath + s->free_slowpath; + + if (!total_alloc) + return; + + printf("\n"); + printf("Slab Perf Counter Alloc Free %%Al %%Fr\n"); + printf("--------------------------------------------------\n"); + printf("Fastpath %8lu %8lu %3lu %3lu\n", + s->alloc_fastpath, s->free_fastpath, + s->alloc_fastpath * 100 / total_alloc, + total_free ? s->free_fastpath * 100 / total_free : 0); + printf("Slowpath %8lu %8lu %3lu %3lu\n", + total_alloc - s->alloc_fastpath, s->free_slowpath, + (total_alloc - s->alloc_fastpath) * 100 / total_alloc, + total_free ? s->free_slowpath * 100 / total_free : 0); + printf("Page Alloc %8lu %8lu %3lu %3lu\n", + s->alloc_slab, s->free_slab, + s->alloc_slab * 100 / total_alloc, + total_free ? s->free_slab * 100 / total_free : 0); + printf("Add partial %8lu %8lu %3lu %3lu\n", + s->deactivate_to_head + s->deactivate_to_tail, + s->free_add_partial, + (s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc, + total_free ? s->free_add_partial * 100 / total_free : 0); + printf("Remove partial %8lu %8lu %3lu %3lu\n", + s->alloc_from_partial, s->free_remove_partial, + s->alloc_from_partial * 100 / total_alloc, + total_free ? s->free_remove_partial * 100 / total_free : 0); + + printf("Cpu partial list %8lu %8lu %3lu %3lu\n", + s->cpu_partial_alloc, s->cpu_partial_free, + s->cpu_partial_alloc * 100 / total_alloc, + total_free ? s->cpu_partial_free * 100 / total_free : 0); + + printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n", + s->deactivate_remote_frees, s->free_frozen, + s->deactivate_remote_frees * 100 / total_alloc, + total_free ? 
s->free_frozen * 100 / total_free : 0); + + printf("Total %8lu %8lu\n\n", total_alloc, total_free); + + if (s->cpuslab_flush) + printf("Flushes %8lu\n", s->cpuslab_flush); + + total = s->deactivate_full + s->deactivate_empty + + s->deactivate_to_head + s->deactivate_to_tail + s->deactivate_bypass; + + if (total) { + printf("\nSlab Deactivation Occurrences %%\n"); + printf("-------------------------------------------------\n"); + printf("Slab full %7lu %3lu%%\n", + s->deactivate_full, (s->deactivate_full * 100) / total); + printf("Slab empty %7lu %3lu%%\n", + s->deactivate_empty, (s->deactivate_empty * 100) / total); + printf("Moved to head of partial list %7lu %3lu%%\n", + s->deactivate_to_head, (s->deactivate_to_head * 100) / total); + printf("Moved to tail of partial list %7lu %3lu%%\n", + s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total); + printf("Deactivation bypass %7lu %3lu%%\n", + s->deactivate_bypass, (s->deactivate_bypass * 100) / total); + printf("Refilled from foreign frees %7lu %3lu%%\n", + s->alloc_refill, (s->alloc_refill * 100) / total); + printf("Node mismatch %7lu %3lu%%\n", + s->alloc_node_mismatch, (s->alloc_node_mismatch * 100) / total); + } + + if (s->cmpxchg_double_fail || s->cmpxchg_double_cpu_fail) { + printf("\nCmpxchg_double Looping\n------------------------\n"); + printf("Locked Cmpxchg Double redos %lu\nUnlocked Cmpxchg Double redos %lu\n", + s->cmpxchg_double_fail, s->cmpxchg_double_cpu_fail); + } +} + +static void report(struct slabinfo *s) +{ + if (strcmp(s->name, "*") == 0) + return; + + printf("\nSlabcache: %-15s Aliases: %2d Order : %2d Objects: %lu\n", + s->name, s->aliases, s->order, s->objects); + if (s->hwcache_align) + printf("** Hardware cacheline aligned\n"); + if (s->cache_dma) + printf("** Memory is allocated in a special DMA zone\n"); + if (s->destroy_by_rcu) + printf("** Slabs are destroyed via RCU\n"); + if (s->reclaim_account) + printf("** Reclaim accounting active\n"); + + printf("\nSizes (bytes) Slabs Debug Memory\n"); + printf("------------------------------------------------------------------------\n"); + printf("Object : %7d Total : %7ld Sanity Checks : %s Total: %7ld\n", + s->object_size, s->slabs, onoff(s->sanity_checks), + s->slabs * (page_size << s->order)); + printf("SlabObj: %7d Full : %7ld Redzoning : %s Used : %7ld\n", + s->slab_size, s->slabs - s->partial - s->cpu_slabs, + onoff(s->red_zone), s->objects * s->object_size); + printf("SlabSiz: %7d Partial: %7ld Poisoning : %s Loss : %7ld\n", + page_size << s->order, s->partial, onoff(s->poison), + s->slabs * (page_size << s->order) - s->objects * s->object_size); + printf("Loss : %7d CpuSlab: %7d Tracking : %s Lalig: %7ld\n", + s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user), + (s->slab_size - s->object_size) * s->objects); + printf("Align : %7d Objects: %7d Tracing : %s Lpadd: %7ld\n", + s->align, s->objs_per_slab, onoff(s->trace), + ((page_size << s->order) - s->objs_per_slab * s->slab_size) * + s->slabs); + + ops(s); + show_tracking(s); + slab_numa(s, 1); + slab_stats(s); +} + +static void slabcache(struct slabinfo *s) +{ + char size_str[20]; + char dist_str[40]; + char flags[20]; + char *p = flags; + + if (strcmp(s->name, "*") == 0) + return; + + if (unreclaim_only && s->reclaim_account) + return; + + if (actual_slabs == 1) { + report(s); + return; + } + + if (skip_zero && !show_empty && !s->slabs) + return; + + if (show_empty && s->slabs) + return; + + if (sort_loss == 0) + store_size(size_str, slab_size(s)); + else + store_size(size_str, 
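+ /* when sorting by loss, the size column reports wasted bytes instead */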
slab_waste(s)); + snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, + s->partial, s->cpu_slabs); + + if (!line++) + first_line(); + + if (s->aliases) + *p++ = '*'; + if (s->cache_dma) + *p++ = 'd'; + if (s->hwcache_align) + *p++ = 'A'; + if (s->poison) + *p++ = 'P'; + if (s->reclaim_account) + *p++ = 'a'; + if (s->red_zone) + *p++ = 'Z'; + if (s->sanity_checks) + *p++ = 'F'; + if (s->store_user) + *p++ = 'U'; + if (s->trace) + *p++ = 'T'; + + *p = 0; + if (show_activity) { + unsigned long total_alloc; + unsigned long total_free; + + total_alloc = s->alloc_fastpath + s->alloc_slowpath; + total_free = s->free_fastpath + s->free_slowpath; + + printf("%-21s %8ld %10ld %10ld %3ld %3ld %5ld %1d %4ld %4ld\n", + s->name, s->objects, + total_alloc, total_free, + total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0, + total_free ? (s->free_fastpath * 100 / total_free) : 0, + s->order_fallback, s->order, s->cmpxchg_double_fail, + s->cmpxchg_double_cpu_fail); + } else { + printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n", + s->name, s->objects, s->object_size, size_str, dist_str, + s->objs_per_slab, s->order, + s->slabs ? (s->partial * 100) / s->slabs : 100, + s->slabs ? (s->objects * s->object_size * 100) / + (s->slabs * (page_size << s->order)) : 100, + flags); + } +} + +/* + * Analyze debug options. Return false if something is amiss. + */ +static int debug_opt_scan(char *opt) +{ + if (!opt || !opt[0] || strcmp(opt, "-") == 0) + return 1; + + if (strcasecmp(opt, "a") == 0) { + sanity = 1; + poison = 1; + redzone = 1; + tracking = 1; + return 1; + } + + for ( ; *opt; opt++) + switch (*opt) { + case 'F' : case 'f': + if (sanity) + return 0; + sanity = 1; + break; + case 'P' : case 'p': + if (poison) + return 0; + poison = 1; + break; + + case 'Z' : case 'z': + if (redzone) + return 0; + redzone = 1; + break; + + case 'U' : case 'u': + if (tracking) + return 0; + tracking = 1; + break; + + case 'T' : case 't': + if (tracing) + return 0; + tracing = 1; + break; + default: + return 0; + } + return 1; +} + +static int slab_empty(struct slabinfo *s) +{ + if (s->objects > 0) + return 0; + + /* + * We may still have slabs even if there are no objects. Shrinking will + * remove them. 
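+ * (set_obj() does this by writing 1 to the cache's "shrink" attribute.)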
+ */
+ if (s->slabs != 0)
+ set_obj(s, "shrink", 1);
+
+ return 1;
+}
+
+static void slab_debug(struct slabinfo *s)
+{
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ if (sanity && !s->sanity_checks) {
+ set_obj(s, "sanity_checks", 1);
+ }
+ if (!sanity && s->sanity_checks) {
+ if (slab_empty(s))
+ set_obj(s, "sanity_checks", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name);
+ }
+ if (redzone && !s->red_zone) {
+ if (slab_empty(s))
+ set_obj(s, "red_zone", 1);
+ else
+ fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name);
+ }
+ if (!redzone && s->red_zone) {
+ if (slab_empty(s))
+ set_obj(s, "red_zone", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name);
+ }
+ if (poison && !s->poison) {
+ if (slab_empty(s))
+ set_obj(s, "poison", 1);
+ else
+ fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name);
+ }
+ if (!poison && s->poison) {
+ if (slab_empty(s))
+ set_obj(s, "poison", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name);
+ }
+ if (tracking && !s->store_user) {
+ if (slab_empty(s))
+ set_obj(s, "store_user", 1);
+ else
+ fprintf(stderr, "%s not empty cannot enable tracking\n", s->name);
+ }
+ if (!tracking && s->store_user) {
+ if (slab_empty(s))
+ set_obj(s, "store_user", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable tracking\n", s->name);
+ }
+ if (tracing && !s->trace) {
+ if (slabs == 1)
+ set_obj(s, "trace", 1);
+ else
+ fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name);
+ }
+ if (!tracing && s->trace)
+ set_obj(s, "trace", 0);
+}
+
+static void totals(void)
+{
+ struct slabinfo *s;
+
+ int used_slabs = 0;
+ char b1[20], b2[20], b3[20], b4[20];
+ unsigned long long max = 1ULL << 63;
+
+ /* Object size */
+ unsigned long long min_objsize = max, max_objsize = 0, avg_objsize;
+
+ /* Number of partial slabs in a slabcache */
+ unsigned long long min_partial = max, max_partial = 0,
+ avg_partial, total_partial = 0;
+
+ /* Number of slabs in a slab cache */
+ unsigned long long min_slabs = max, max_slabs = 0,
+ avg_slabs, total_slabs = 0;
+
+ /* Size of the whole slab */
+ unsigned long long min_size = max, max_size = 0,
+ avg_size, total_size = 0;
+
+ /* Bytes used for object storage in a slab */
+ unsigned long long min_used = max, max_used = 0,
+ avg_used, total_used = 0;
+
+ /* Waste: Bytes used for alignment and padding */
+ unsigned long long min_waste = max, max_waste = 0,
+ avg_waste, total_waste = 0;
+ /* Number of objects in a slab */
+ unsigned long long min_objects = max, max_objects = 0,
+ avg_objects, total_objects = 0;
+ /* Waste per object */
+ unsigned long long min_objwaste = max,
+ max_objwaste = 0, avg_objwaste,
+ total_objwaste = 0;
+
+ /* Memory per object */
+ unsigned long long min_memobj = max,
+ max_memobj = 0, avg_memobj,
+ total_objsize = 0;
+
+ /* Percentage of partial slabs per slab */
+ unsigned long min_ppart = 100, max_ppart = 0,
+ avg_ppart, total_ppart = 0;
+
+ /* Number of objects in partial slabs */
+ unsigned long min_partobj = max, max_partobj = 0,
+ avg_partobj, total_partobj = 0;
+
+ /* Percentage of partial objects of all objects in a slab */
+ unsigned long min_ppartobj = 100, max_ppartobj = 0,
+ avg_ppartobj, total_ppartobj = 0;
+
+
+ for (s = slabinfo; s < slabinfo + slabs; s++) {
+ unsigned long long size;
+ unsigned long used;
+ unsigned long long wasted;
+ unsigned long long objwaste;
+ unsigned long percentage_partial_slabs;
+ unsigned long percentage_partial_objs;
+
+ if
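+ /* caches that currently hold no slabs or objects are skipped */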
(!s->slabs || !s->objects) + continue; + + used_slabs++; + + size = slab_size(s); + used = s->objects * s->object_size; + wasted = size - used; + objwaste = s->slab_size - s->object_size; + + percentage_partial_slabs = s->partial * 100 / s->slabs; + if (percentage_partial_slabs > 100) + percentage_partial_slabs = 100; + + percentage_partial_objs = s->objects_partial * 100 + / s->objects; + + if (percentage_partial_objs > 100) + percentage_partial_objs = 100; + + if (s->object_size < min_objsize) + min_objsize = s->object_size; + if (s->partial < min_partial) + min_partial = s->partial; + if (s->slabs < min_slabs) + min_slabs = s->slabs; + if (size < min_size) + min_size = size; + if (wasted < min_waste) + min_waste = wasted; + if (objwaste < min_objwaste) + min_objwaste = objwaste; + if (s->objects < min_objects) + min_objects = s->objects; + if (used < min_used) + min_used = used; + if (s->objects_partial < min_partobj) + min_partobj = s->objects_partial; + if (percentage_partial_slabs < min_ppart) + min_ppart = percentage_partial_slabs; + if (percentage_partial_objs < min_ppartobj) + min_ppartobj = percentage_partial_objs; + if (s->slab_size < min_memobj) + min_memobj = s->slab_size; + + if (s->object_size > max_objsize) + max_objsize = s->object_size; + if (s->partial > max_partial) + max_partial = s->partial; + if (s->slabs > max_slabs) + max_slabs = s->slabs; + if (size > max_size) + max_size = size; + if (wasted > max_waste) + max_waste = wasted; + if (objwaste > max_objwaste) + max_objwaste = objwaste; + if (s->objects > max_objects) + max_objects = s->objects; + if (used > max_used) + max_used = used; + if (s->objects_partial > max_partobj) + max_partobj = s->objects_partial; + if (percentage_partial_slabs > max_ppart) + max_ppart = percentage_partial_slabs; + if (percentage_partial_objs > max_ppartobj) + max_ppartobj = percentage_partial_objs; + if (s->slab_size > max_memobj) + max_memobj = s->slab_size; + + total_partial += s->partial; + total_slabs += s->slabs; + total_size += size; + total_waste += wasted; + + total_objects += s->objects; + total_used += used; + total_partobj += s->objects_partial; + total_ppart += percentage_partial_slabs; + total_ppartobj += percentage_partial_objs; + + total_objwaste += s->objects * objwaste; + total_objsize += s->objects * s->slab_size; + } + + if (!total_objects) { + printf("No objects\n"); + return; + } + if (!used_slabs) { + printf("No slabs\n"); + return; + } + + /* Per slab averages */ + avg_partial = total_partial / used_slabs; + avg_slabs = total_slabs / used_slabs; + avg_size = total_size / used_slabs; + avg_waste = total_waste / used_slabs; + + avg_objects = total_objects / used_slabs; + avg_used = total_used / used_slabs; + avg_partobj = total_partobj / used_slabs; + avg_ppart = total_ppart / used_slabs; + avg_ppartobj = total_ppartobj / used_slabs; + + /* Per object object sizes */ + avg_objsize = total_used / total_objects; + avg_objwaste = total_objwaste / total_objects; + avg_partobj = total_partobj * 100 / total_objects; + avg_memobj = total_objsize / total_objects; + + printf("Slabcache Totals\n"); + printf("----------------\n"); + printf("Slabcaches : %15d Aliases : %11d->%-3d Active: %3d\n", + slabs, aliases, alias_targets, used_slabs); + + store_size(b1, total_size);store_size(b2, total_waste); + store_size(b3, total_waste * 100 / total_used); + printf("Memory used: %15s # Loss : %15s MRatio:%6s%%\n", b1, b2, b3); + + store_size(b1, total_objects);store_size(b2, total_partobj); + store_size(b3, total_partobj * 100 / 
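+ /* overall share of objects that sit in partial slabs */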
total_objects); + printf("# Objects : %15s # PartObj: %15s ORatio:%6s%%\n", b1, b2, b3); + + printf("\n"); + printf("Per Cache Average " + "Min Max Total\n"); + printf("---------------------------------------" + "-------------------------------------\n"); + + store_size(b1, avg_objects);store_size(b2, min_objects); + store_size(b3, max_objects);store_size(b4, total_objects); + printf("#Objects %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + store_size(b1, avg_slabs);store_size(b2, min_slabs); + store_size(b3, max_slabs);store_size(b4, total_slabs); + printf("#Slabs %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + store_size(b1, avg_partial);store_size(b2, min_partial); + store_size(b3, max_partial);store_size(b4, total_partial); + printf("#PartSlab %15s %15s %15s %15s\n", + b1, b2, b3, b4); + store_size(b1, avg_ppart);store_size(b2, min_ppart); + store_size(b3, max_ppart); + store_size(b4, total_partial * 100 / total_slabs); + printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n", + b1, b2, b3, b4); + + store_size(b1, avg_partobj);store_size(b2, min_partobj); + store_size(b3, max_partobj); + store_size(b4, total_partobj); + printf("PartObjs %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj); + store_size(b3, max_ppartobj); + store_size(b4, total_partobj * 100 / total_objects); + printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n", + b1, b2, b3, b4); + + store_size(b1, avg_size);store_size(b2, min_size); + store_size(b3, max_size);store_size(b4, total_size); + printf("Memory %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + store_size(b1, avg_used);store_size(b2, min_used); + store_size(b3, max_used);store_size(b4, total_used); + printf("Used %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + store_size(b1, avg_waste);store_size(b2, min_waste); + store_size(b3, max_waste);store_size(b4, total_waste); + printf("Loss %15s %15s %15s %15s\n", + b1, b2, b3, b4); + + printf("\n"); + printf("Per Object Average " + "Min Max\n"); + printf("---------------------------------------" + "--------------------\n"); + + store_size(b1, avg_memobj);store_size(b2, min_memobj); + store_size(b3, max_memobj); + printf("Memory %15s %15s %15s\n", + b1, b2, b3); + store_size(b1, avg_objsize);store_size(b2, min_objsize); + store_size(b3, max_objsize); + printf("User %15s %15s %15s\n", + b1, b2, b3); + + store_size(b1, avg_objwaste);store_size(b2, min_objwaste); + store_size(b3, max_objwaste); + printf("Loss %15s %15s %15s\n", + b1, b2, b3); +} + +static void sort_slabs(void) +{ + struct slabinfo *s1,*s2; + + for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) { + for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) { + int result; + + if (sort_size) { + if (slab_size(s1) == slab_size(s2)) + result = strcasecmp(s1->name, s2->name); + else + result = slab_size(s1) < slab_size(s2); + } else if (sort_active) { + if (slab_activity(s1) == slab_activity(s2)) + result = strcasecmp(s1->name, s2->name); + else + result = slab_activity(s1) < slab_activity(s2); + } else if (sort_loss) { + if (slab_waste(s1) == slab_waste(s2)) + result = strcasecmp(s1->name, s2->name); + else + result = slab_waste(s1) < slab_waste(s2); + } else if (sort_partial) { + if (s1->partial == s2->partial) + result = strcasecmp(s1->name, s2->name); + else + result = s1->partial < s2->partial; + } else + result = strcasecmp(s1->name, s2->name); + + if (show_inverted) + result = -result; + + if (result > 0) { + struct slabinfo t; + + memcpy(&t, s1, sizeof(struct slabinfo)); + memcpy(s1, s2, sizeof(struct slabinfo)); + memcpy(s2, 
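+ /* complete the swap through the temporary copy */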
&t, sizeof(struct slabinfo)); + } + } + } +} + +static void sort_aliases(void) +{ + struct aliasinfo *a1,*a2; + + for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) { + for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) { + char *n1, *n2; + + n1 = a1->name; + n2 = a2->name; + if (show_alias && !show_inverted) { + n1 = a1->ref; + n2 = a2->ref; + } + if (strcasecmp(n1, n2) > 0) { + struct aliasinfo t; + + memcpy(&t, a1, sizeof(struct aliasinfo)); + memcpy(a1, a2, sizeof(struct aliasinfo)); + memcpy(a2, &t, sizeof(struct aliasinfo)); + } + } + } +} + +static void link_slabs(void) +{ + struct aliasinfo *a; + struct slabinfo *s; + + for (a = aliasinfo; a < aliasinfo + aliases; a++) { + + for (s = slabinfo; s < slabinfo + slabs; s++) + if (strcmp(a->ref, s->name) == 0) { + a->slab = s; + s->refs++; + break; + } + if (s == slabinfo + slabs) + fatal("Unresolved alias %s\n", a->ref); + } +} + +static void alias(void) +{ + struct aliasinfo *a; + char *active = NULL; + + sort_aliases(); + link_slabs(); + + for(a = aliasinfo; a < aliasinfo + aliases; a++) { + + if (!show_single_ref && a->slab->refs == 1) + continue; + + if (!show_inverted) { + if (active) { + if (strcmp(a->slab->name, active) == 0) { + printf(" %s", a->name); + continue; + } + } + printf("\n%-12s <- %s", a->slab->name, a->name); + active = a->slab->name; + } + else + printf("%-15s -> %s\n", a->name, a->slab->name); + } + if (active) + printf("\n"); +} + + +static void rename_slabs(void) +{ + struct slabinfo *s; + struct aliasinfo *a; + + for (s = slabinfo; s < slabinfo + slabs; s++) { + if (*s->name != ':') + continue; + + if (s->refs > 1 && !show_first_alias) + continue; + + a = find_one_alias(s); + + if (a) + s->name = a->name; + else { + s->name = "*"; + actual_slabs--; + } + } +} + +static int slab_mismatch(char *slab) +{ + return regexec(&pattern, slab, 0, NULL, 0); +} + +static void read_slab_dir(void) +{ + DIR *dir; + struct dirent *de; + struct slabinfo *slab = slabinfo; + struct aliasinfo *alias = aliasinfo; + char *p; + char *t; + int count; + + if (chdir("/sys/kernel/slab") && chdir("/sys/slab")) + fatal("SYSFS support for SLUB not active\n"); + + dir = opendir("."); + while ((de = readdir(dir))) { + if (de->d_name[0] == '.' 
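+ /* skip dot entries; the name pattern applies only to non-':' slabs */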
|| + (de->d_name[0] != ':' && slab_mismatch(de->d_name))) + continue; + switch (de->d_type) { + case DT_LNK: + alias->name = strdup(de->d_name); + count = readlink(de->d_name, buffer, sizeof(buffer)-1); + + if (count < 0) + fatal("Cannot read symlink %s\n", de->d_name); + + buffer[count] = 0; + p = buffer + count; + while (p > buffer && p[-1] != '/') + p--; + alias->ref = strdup(p); + alias++; + break; + case DT_DIR: + if (chdir(de->d_name)) + fatal("Unable to access slab %s\n", slab->name); + slab->name = strdup(de->d_name); + slab->alias = 0; + slab->refs = 0; + slab->aliases = get_obj("aliases"); + slab->align = get_obj("align"); + slab->cache_dma = get_obj("cache_dma"); + slab->cpu_slabs = get_obj("cpu_slabs"); + slab->destroy_by_rcu = get_obj("destroy_by_rcu"); + slab->hwcache_align = get_obj("hwcache_align"); + slab->object_size = get_obj("object_size"); + slab->objects = get_obj("objects"); + slab->objects_partial = get_obj("objects_partial"); + slab->objects_total = get_obj("objects_total"); + slab->objs_per_slab = get_obj("objs_per_slab"); + slab->order = get_obj("order"); + slab->partial = get_obj("partial"); + slab->partial = get_obj_and_str("partial", &t); + decode_numa_list(slab->numa_partial, t); + free(t); + slab->poison = get_obj("poison"); + slab->reclaim_account = get_obj("reclaim_account"); + slab->red_zone = get_obj("red_zone"); + slab->sanity_checks = get_obj("sanity_checks"); + slab->slab_size = get_obj("slab_size"); + slab->slabs = get_obj_and_str("slabs", &t); + decode_numa_list(slab->numa, t); + free(t); + slab->store_user = get_obj("store_user"); + slab->trace = get_obj("trace"); + slab->alloc_fastpath = get_obj("alloc_fastpath"); + slab->alloc_slowpath = get_obj("alloc_slowpath"); + slab->free_fastpath = get_obj("free_fastpath"); + slab->free_slowpath = get_obj("free_slowpath"); + slab->free_frozen= get_obj("free_frozen"); + slab->free_add_partial = get_obj("free_add_partial"); + slab->free_remove_partial = get_obj("free_remove_partial"); + slab->alloc_from_partial = get_obj("alloc_from_partial"); + slab->alloc_slab = get_obj("alloc_slab"); + slab->alloc_refill = get_obj("alloc_refill"); + slab->free_slab = get_obj("free_slab"); + slab->cpuslab_flush = get_obj("cpuslab_flush"); + slab->deactivate_full = get_obj("deactivate_full"); + slab->deactivate_empty = get_obj("deactivate_empty"); + slab->deactivate_to_head = get_obj("deactivate_to_head"); + slab->deactivate_to_tail = get_obj("deactivate_to_tail"); + slab->deactivate_remote_frees = get_obj("deactivate_remote_frees"); + slab->order_fallback = get_obj("order_fallback"); + slab->cmpxchg_double_cpu_fail = get_obj("cmpxchg_double_cpu_fail"); + slab->cmpxchg_double_fail = get_obj("cmpxchg_double_fail"); + slab->cpu_partial_alloc = get_obj("cpu_partial_alloc"); + slab->cpu_partial_free = get_obj("cpu_partial_free"); + slab->alloc_node_mismatch = get_obj("alloc_node_mismatch"); + slab->deactivate_bypass = get_obj("deactivate_bypass"); + chdir(".."); + if (slab->name[0] == ':') + alias_targets++; + slab++; + break; + default : + fatal("Unknown file type %lx\n", de->d_type); + } + } + closedir(dir); + slabs = slab - slabinfo; + actual_slabs = slabs; + aliases = alias - aliasinfo; + if (slabs > MAX_SLABS) + fatal("Too many slabs\n"); + if (aliases > MAX_ALIASES) + fatal("Too many aliases\n"); +} + +static void output_slabs(void) +{ + struct slabinfo *slab; + int lines = output_lines; + + for (slab = slabinfo; (slab < slabinfo + slabs) && + lines != 0; slab++) { + + if (slab->alias) + continue; + + if (lines != -1) + 
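+ /* a line budget of -1 means unlimited output */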
lines--; + + if (show_numa) + slab_numa(slab, 0); + else if (show_track) + show_tracking(slab); + else if (validate) + slab_validate(slab); + else if (shrink) + slab_shrink(slab); + else if (set_debug) + slab_debug(slab); + else if (show_ops) + ops(slab); + else if (show_slab) + slabcache(slab); + else if (show_report) + report(slab); + } +} + +static void _xtotals(char *heading, char *underline, + int loss, int size, int partial) +{ + printf("%s%s", heading, underline); + line = 0; + sort_loss = loss; + sort_size = size; + sort_partial = partial; + sort_slabs(); + output_slabs(); +} + +static void xtotals(void) +{ + char *heading, *underline; + + totals(); + + link_slabs(); + rename_slabs(); + + heading = "\nSlabs sorted by size\n"; + underline = "--------------------\n"; + _xtotals(heading, underline, 0, 1, 0); + + heading = "\nSlabs sorted by loss\n"; + underline = "--------------------\n"; + _xtotals(heading, underline, 1, 0, 0); + + heading = "\nSlabs sorted by number of partial slabs\n"; + underline = "---------------------------------------\n"; + _xtotals(heading, underline, 0, 0, 1); + + printf("\n"); +} + +struct option opts[] = { + { "aliases", no_argument, NULL, 'a' }, + { "activity", no_argument, NULL, 'A' }, + { "Bytes", no_argument, NULL, 'B'}, + { "debug", optional_argument, NULL, 'd' }, + { "display-activity", no_argument, NULL, 'D' }, + { "empty", no_argument, NULL, 'e' }, + { "first-alias", no_argument, NULL, 'f' }, + { "help", no_argument, NULL, 'h' }, + { "inverted", no_argument, NULL, 'i'}, + { "slabs", no_argument, NULL, 'l' }, + { "Loss", no_argument, NULL, 'L'}, + { "numa", no_argument, NULL, 'n' }, + { "lines", required_argument, NULL, 'N'}, + { "ops", no_argument, NULL, 'o' }, + { "partial", no_argument, NULL, 'p'}, + { "report", no_argument, NULL, 'r' }, + { "shrink", no_argument, NULL, 's' }, + { "Size", no_argument, NULL, 'S'}, + { "tracking", no_argument, NULL, 't'}, + { "Totals", no_argument, NULL, 'T'}, + { "Unreclaim", no_argument, NULL, 'U'}, + { "validate", no_argument, NULL, 'v' }, + { "Xtotals", no_argument, NULL, 'X'}, + { "zero", no_argument, NULL, 'z' }, + { "1ref", no_argument, NULL, '1'}, + { NULL, 0, NULL, 0 } +}; + +int main(int argc, char *argv[]) +{ + int c; + int err; + char *pattern_source; + + page_size = getpagesize(); + + while ((c = getopt_long(argc, argv, "aABd::DefhilLnN:oPrsStTUvXz1", + opts, NULL)) != -1) + switch (c) { + case 'a': + show_alias = 1; + break; + case 'A': + sort_active = 1; + break; + case 'B': + show_bytes = 1; + break; + case 'd': + set_debug = 1; + if (!debug_opt_scan(optarg)) + fatal("Invalid debug option '%s'\n", optarg); + break; + case 'D': + show_activity = 1; + break; + case 'e': + show_empty = 1; + break; + case 'f': + show_first_alias = 1; + break; + case 'h': + usage(); + return 0; + case 'i': + show_inverted = 1; + break; + case 'l': + show_slab = 1; + break; + case 'L': + sort_loss = 1; + break; + case 'n': + show_numa = 1; + break; + case 'N': + if (optarg) { + output_lines = atoi(optarg); + if (output_lines < 1) + output_lines = 1; + } + break; + case 'o': + show_ops = 1; + break; + case 'r': + show_report = 1; + break; + case 'P': + sort_partial = 1; + break; + case 's': + shrink = 1; + break; + case 'S': + sort_size = 1; + break; + case 't': + show_track = 1; + break; + case 'T': + show_totals = 1; + break; + case 'U': + unreclaim_only = 1; + break; + case 'v': + validate = 1; + break; + case 'X': + if (output_lines == -1) + output_lines = 1; + extended_totals = 1; + show_bytes = 1; + break; + case 
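+ /* -z: also list slabs that currently contain no objects */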
'z': + skip_zero = 0; + break; + case '1': + show_single_ref = 1; + break; + default: + fatal("%s: Invalid option '%c'\n", argv[0], optopt); + + } + + if (!show_slab && !show_alias && !show_track && !show_report + && !validate && !shrink && !set_debug && !show_ops) + show_slab = 1; + + if (argc > optind) + pattern_source = argv[optind]; + else + pattern_source = ".*"; + + err = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB); + if (err) + fatal("%s: Invalid pattern '%s' code %d\n", + argv[0], pattern_source, err); + read_slab_dir(); + if (show_alias) { + alias(); + } else if (extended_totals) { + xtotals(); + } else if (show_totals) { + totals(); + } else { + link_slabs(); + rename_slabs(); + sort_slabs(); + output_slabs(); + } + return 0; +} diff --git a/tools/vm/.gitignore b/tools/vm/.gitignore deleted file mode 100644 index 922879f93fc8..000000000000 --- a/tools/vm/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -slabinfo -page-types -page_owner_sort diff --git a/tools/vm/Makefile b/tools/vm/Makefile deleted file mode 100644 index 9860622cbb15..000000000000 --- a/tools/vm/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for vm tools -# -include ../scripts/Makefile.include - -TARGETS=page-types slabinfo page_owner_sort - -LIB_DIR = ../lib/api -LIBS = $(LIB_DIR)/libapi.a - -CFLAGS = -Wall -Wextra -I../lib/ -LDFLAGS = $(LIBS) - -all: $(TARGETS) - -$(TARGETS): $(LIBS) - -$(LIBS): - make -C $(LIB_DIR) - -%: %.c - $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) - -clean: - $(RM) page-types slabinfo page_owner_sort - make -C $(LIB_DIR) clean - -sbindir ?= /usr/sbin - -install: all - install -d $(DESTDIR)$(sbindir) - install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir) diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c deleted file mode 100644 index 381dcc00cb62..000000000000 --- a/tools/vm/page-types.c +++ /dev/null @@ -1,1396 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * page-types: Tool for querying page flags - * - * Copyright (C) 2009 Intel corporation - * - * Authors: Wu Fengguang - */ - -#define _FILE_OFFSET_BITS 64 -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../../include/uapi/linux/magic.h" -#include "../../include/uapi/linux/kernel-page-flags.h" -#include - -#ifndef MAX_PATH -# define MAX_PATH 256 -#endif - -#ifndef STR -# define _STR(x) #x -# define STR(x) _STR(x) -#endif - -/* - * pagemap kernel ABI bits - */ - -#define PM_ENTRY_BYTES 8 -#define PM_PFRAME_BITS 55 -#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1) -#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) -#define MAX_SWAPFILES_SHIFT 5 -#define PM_SWAP_OFFSET(x) (((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT) -#define PM_SOFT_DIRTY (1ULL << 55) -#define PM_MMAP_EXCLUSIVE (1ULL << 56) -#define PM_FILE (1ULL << 61) -#define PM_SWAP (1ULL << 62) -#define PM_PRESENT (1ULL << 63) - -/* - * kernel page flags - */ - -#define KPF_BYTES 8 -#define PROC_KPAGEFLAGS "/proc/kpageflags" -#define PROC_KPAGECOUNT "/proc/kpagecount" -#define PROC_KPAGECGROUP "/proc/kpagecgroup" - -#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap" - -/* [32-] kernel hacking assistances */ -#define KPF_RESERVED 32 -#define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 -#define KPF_PRIVATE 35 -#define KPF_PRIVATE_2 36 -#define KPF_OWNER_PRIVATE 37 -#define KPF_ARCH 38 -#define KPF_UNCACHED 39 
-#define KPF_SOFTDIRTY 40 -#define KPF_ARCH_2 41 - -/* [47-] take some arbitrary free slots for expanding overloaded flags - * not part of kernel API - */ -#define KPF_ANON_EXCLUSIVE 47 -#define KPF_READAHEAD 48 -#define KPF_SLOB_FREE 49 -#define KPF_SLUB_FROZEN 50 -#define KPF_SLUB_DEBUG 51 -#define KPF_FILE 61 -#define KPF_SWAP 62 -#define KPF_MMAP_EXCLUSIVE 63 - -#define KPF_ALL_BITS ((uint64_t)~0ULL) -#define KPF_HACKERS_BITS (0xffffULL << 32) -#define KPF_OVERLOADED_BITS (0xffffULL << 48) -#define BIT(name) (1ULL << KPF_##name) -#define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) - -static const char * const page_flag_names[] = { - [KPF_LOCKED] = "L:locked", - [KPF_ERROR] = "E:error", - [KPF_REFERENCED] = "R:referenced", - [KPF_UPTODATE] = "U:uptodate", - [KPF_DIRTY] = "D:dirty", - [KPF_LRU] = "l:lru", - [KPF_ACTIVE] = "A:active", - [KPF_SLAB] = "S:slab", - [KPF_WRITEBACK] = "W:writeback", - [KPF_RECLAIM] = "I:reclaim", - [KPF_BUDDY] = "B:buddy", - - [KPF_MMAP] = "M:mmap", - [KPF_ANON] = "a:anonymous", - [KPF_SWAPCACHE] = "s:swapcache", - [KPF_SWAPBACKED] = "b:swapbacked", - [KPF_COMPOUND_HEAD] = "H:compound_head", - [KPF_COMPOUND_TAIL] = "T:compound_tail", - [KPF_HUGE] = "G:huge", - [KPF_UNEVICTABLE] = "u:unevictable", - [KPF_HWPOISON] = "X:hwpoison", - [KPF_NOPAGE] = "n:nopage", - [KPF_KSM] = "x:ksm", - [KPF_THP] = "t:thp", - [KPF_OFFLINE] = "o:offline", - [KPF_PGTABLE] = "g:pgtable", - [KPF_ZERO_PAGE] = "z:zero_page", - [KPF_IDLE] = "i:idle_page", - - [KPF_RESERVED] = "r:reserved", - [KPF_MLOCKED] = "m:mlocked", - [KPF_MAPPEDTODISK] = "d:mappedtodisk", - [KPF_PRIVATE] = "P:private", - [KPF_PRIVATE_2] = "p:private_2", - [KPF_OWNER_PRIVATE] = "O:owner_private", - [KPF_ARCH] = "h:arch", - [KPF_UNCACHED] = "c:uncached", - [KPF_SOFTDIRTY] = "f:softdirty", - [KPF_ARCH_2] = "H:arch_2", - - [KPF_ANON_EXCLUSIVE] = "d:anon_exclusive", - [KPF_READAHEAD] = "I:readahead", - [KPF_SLOB_FREE] = "P:slob_free", - [KPF_SLUB_FROZEN] = "A:slub_frozen", - [KPF_SLUB_DEBUG] = "E:slub_debug", - - [KPF_FILE] = "F:file", - [KPF_SWAP] = "w:swap", - [KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive", -}; - - -/* - * data structures - */ - -static int opt_raw; /* for kernel developers */ -static int opt_list; /* list pages (in ranges) */ -static int opt_mark_idle; /* set accessed bit */ -static int opt_no_summary; /* don't show summary */ -static pid_t opt_pid; /* process to walk */ -const char *opt_file; /* file or directory path */ -static uint64_t opt_cgroup; /* cgroup inode */ -static int opt_list_cgroup;/* list page cgroup */ -static int opt_list_mapcnt;/* list page map count */ -static const char *opt_kpageflags;/* kpageflags file to parse */ - -#define MAX_ADDR_RANGES 1024 -static int nr_addr_ranges; -static unsigned long opt_offset[MAX_ADDR_RANGES]; -static unsigned long opt_size[MAX_ADDR_RANGES]; - -#define MAX_VMAS 10240 -static int nr_vmas; -static unsigned long pg_start[MAX_VMAS]; -static unsigned long pg_end[MAX_VMAS]; - -#define MAX_BIT_FILTERS 64 -static int nr_bit_filters; -static uint64_t opt_mask[MAX_BIT_FILTERS]; -static uint64_t opt_bits[MAX_BIT_FILTERS]; - -static int page_size; - -static int pagemap_fd; -static int kpageflags_fd; -static int kpagecount_fd = -1; -static int kpagecgroup_fd = -1; -static int page_idle_fd = -1; - -static int opt_hwpoison; -static int opt_unpoison; - -static const char *hwpoison_debug_fs; -static int hwpoison_inject_fd; -static int hwpoison_forget_fd; - -#define HASH_SHIFT 13 -#define HASH_SIZE (1 << HASH_SHIFT) -#define HASH_MASK (HASH_SIZE - 1) -#define 
HASH_KEY(flags) (flags & HASH_MASK) - -static unsigned long total_pages; -static unsigned long nr_pages[HASH_SIZE]; -static uint64_t page_flags[HASH_SIZE]; - - -/* - * helper functions - */ - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -#define min_t(type, x, y) ({ \ - type __min1 = (x); \ - type __min2 = (y); \ - __min1 < __min2 ? __min1 : __min2; }) - -#define max_t(type, x, y) ({ \ - type __max1 = (x); \ - type __max2 = (y); \ - __max1 > __max2 ? __max1 : __max2; }) - -static unsigned long pages2mb(unsigned long pages) -{ - return (pages * page_size) >> 20; -} - -static void fatal(const char *x, ...) -{ - va_list ap; - - va_start(ap, x); - vfprintf(stderr, x, ap); - va_end(ap); - exit(EXIT_FAILURE); -} - -static int checked_open(const char *pathname, int flags) -{ - int fd = open(pathname, flags); - - if (fd < 0) { - perror(pathname); - exit(EXIT_FAILURE); - } - - return fd; -} - -/* - * pagemap/kpageflags routines - */ - -static unsigned long do_u64_read(int fd, const char *name, - uint64_t *buf, - unsigned long index, - unsigned long count) -{ - long bytes; - - if (index > ULONG_MAX / 8) - fatal("index overflow: %lu\n", index); - - bytes = pread(fd, buf, count * 8, (off_t)index * 8); - if (bytes < 0) { - perror(name); - exit(EXIT_FAILURE); - } - if (bytes % 8) - fatal("partial read: %lu bytes\n", bytes); - - return bytes / 8; -} - -static unsigned long kpageflags_read(uint64_t *buf, - unsigned long index, - unsigned long pages) -{ - return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages); -} - -static unsigned long kpagecgroup_read(uint64_t *buf, - unsigned long index, - unsigned long pages) -{ - if (kpagecgroup_fd < 0) - return pages; - - return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages); -} - -static unsigned long kpagecount_read(uint64_t *buf, - unsigned long index, - unsigned long pages) -{ - return kpagecount_fd < 0 ? pages : - do_u64_read(kpagecount_fd, PROC_KPAGECOUNT, - buf, index, pages); -} - -static unsigned long pagemap_read(uint64_t *buf, - unsigned long index, - unsigned long pages) -{ - return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages); -} - -static unsigned long pagemap_pfn(uint64_t val) -{ - unsigned long pfn; - - if (val & PM_PRESENT) - pfn = PM_PFRAME(val); - else - pfn = 0; - - return pfn; -} - -static unsigned long pagemap_swap_offset(uint64_t val) -{ - return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0; -} - -/* - * page flag names - */ - -static char *page_flag_name(uint64_t flags) -{ - static char buf[65]; - int present; - size_t i, j; - - for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { - present = (flags >> i) & 1; - if (!page_flag_names[i]) { - if (present) - fatal("unknown flag bit %d\n", i); - continue; - } - buf[j++] = present ? 
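- /* one mnemonic letter per known flag, or '_' when the bit is clear */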
page_flag_names[i][0] : '_'; - } - - return buf; -} - -static char *page_flag_longname(uint64_t flags) -{ - static char buf[1024]; - size_t i, n; - - for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) { - if (!page_flag_names[i]) - continue; - if ((flags >> i) & 1) - n += snprintf(buf + n, sizeof(buf) - n, "%s,", - page_flag_names[i] + 2); - } - if (n) - n--; - buf[n] = '\0'; - - return buf; -} - - -/* - * page list and summary - */ - -static void show_page_range(unsigned long voffset, unsigned long offset, - unsigned long size, uint64_t flags, - uint64_t cgroup, uint64_t mapcnt) -{ - static uint64_t flags0; - static uint64_t cgroup0; - static uint64_t mapcnt0; - static unsigned long voff; - static unsigned long index; - static unsigned long count; - - if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 && - offset == index + count && size && voffset == voff + count) { - count += size; - return; - } - - if (count) { - if (opt_pid) - printf("%lx\t", voff); - if (opt_file) - printf("%lx\t", voff); - if (opt_list_cgroup) - printf("@%llu\t", (unsigned long long)cgroup0); - if (opt_list_mapcnt) - printf("%lu\t", mapcnt0); - printf("%lx\t%lx\t%s\n", - index, count, page_flag_name(flags0)); - } - - flags0 = flags; - cgroup0 = cgroup; - mapcnt0 = mapcnt; - index = offset; - voff = voffset; - count = size; -} - -static void flush_page_range(void) -{ - show_page_range(0, 0, 0, 0, 0, 0); -} - -static void show_page(unsigned long voffset, unsigned long offset, - uint64_t flags, uint64_t cgroup, uint64_t mapcnt) -{ - if (opt_pid) - printf("%lx\t", voffset); - if (opt_file) - printf("%lx\t", voffset); - if (opt_list_cgroup) - printf("@%llu\t", (unsigned long long)cgroup); - if (opt_list_mapcnt) - printf("%lu\t", mapcnt); - - printf("%lx\t%s\n", offset, page_flag_name(flags)); -} - -static void show_summary(void) -{ - size_t i; - - printf(" flags\tpage-count MB" - " symbolic-flags\t\t\tlong-symbolic-flags\n"); - - for (i = 0; i < ARRAY_SIZE(nr_pages); i++) { - if (nr_pages[i]) - printf("0x%016llx\t%10lu %8lu %s\t%s\n", - (unsigned long long)page_flags[i], - nr_pages[i], - pages2mb(nr_pages[i]), - page_flag_name(page_flags[i]), - page_flag_longname(page_flags[i])); - } - - printf(" total\t%10lu %8lu\n", - total_pages, pages2mb(total_pages)); -} - - -/* - * page flag filters - */ - -static int bit_mask_ok(uint64_t flags) -{ - int i; - - for (i = 0; i < nr_bit_filters; i++) { - if (opt_bits[i] == KPF_ALL_BITS) { - if ((flags & opt_mask[i]) == 0) - return 0; - } else { - if ((flags & opt_mask[i]) != opt_bits[i]) - return 0; - } - } - - return 1; -} - -static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme) -{ - /* Anonymous pages overload PG_mappedtodisk */ - if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK))) - flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE); - - /* SLOB/SLUB overload several page flags */ - if (flags & BIT(SLAB)) { - if (flags & BIT(PRIVATE)) - flags ^= BIT(PRIVATE) | BIT(SLOB_FREE); - if (flags & BIT(ACTIVE)) - flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN); - if (flags & BIT(ERROR)) - flags ^= BIT(ERROR) | BIT(SLUB_DEBUG); - } - - /* PG_reclaim is overloaded as PG_readahead in the read path */ - if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM)) - flags ^= BIT(RECLAIM) | BIT(READAHEAD); - - if (pme & PM_SOFT_DIRTY) - flags |= BIT(SOFTDIRTY); - if (pme & PM_FILE) - flags |= BIT(FILE); - if (pme & PM_SWAP) - flags |= BIT(SWAP); - if (pme & PM_MMAP_EXCLUSIVE) - flags |= BIT(MMAP_EXCLUSIVE); - - return flags; -} - -static uint64_t 
well_known_flags(uint64_t flags) -{ - /* hide flags intended only for kernel hacker */ - flags &= ~KPF_HACKERS_BITS; - - /* hide non-hugeTLB compound pages */ - if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE))) - flags &= ~BITS_COMPOUND; - - return flags; -} - -static uint64_t kpageflags_flags(uint64_t flags, uint64_t pme) -{ - if (opt_raw) - flags = expand_overloaded_flags(flags, pme); - else - flags = well_known_flags(flags); - - return flags; -} - -/* - * page actions - */ - -static void prepare_hwpoison_fd(void) -{ - char buf[MAX_PATH + 1]; - - hwpoison_debug_fs = debugfs__mount(); - if (!hwpoison_debug_fs) { - perror("mount debugfs"); - exit(EXIT_FAILURE); - } - - if (opt_hwpoison && !hwpoison_inject_fd) { - snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn", - hwpoison_debug_fs); - hwpoison_inject_fd = checked_open(buf, O_WRONLY); - } - - if (opt_unpoison && !hwpoison_forget_fd) { - snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn", - hwpoison_debug_fs); - hwpoison_forget_fd = checked_open(buf, O_WRONLY); - } -} - -static int hwpoison_page(unsigned long offset) -{ - char buf[100]; - int len; - - len = sprintf(buf, "0x%lx\n", offset); - len = write(hwpoison_inject_fd, buf, len); - if (len < 0) { - perror("hwpoison inject"); - return len; - } - return 0; -} - -static int unpoison_page(unsigned long offset) -{ - char buf[100]; - int len; - - len = sprintf(buf, "0x%lx\n", offset); - len = write(hwpoison_forget_fd, buf, len); - if (len < 0) { - perror("hwpoison forget"); - return len; - } - return 0; -} - -static int mark_page_idle(unsigned long offset) -{ - static unsigned long off; - static uint64_t buf; - int len; - - if ((offset / 64 == off / 64) || buf == 0) { - buf |= 1UL << (offset % 64); - off = offset; - return 0; - } - - len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64)); - if (len < 0) { - perror("mark page idle"); - return len; - } - - buf = 1UL << (offset % 64); - off = offset; - - return 0; -} - -/* - * page frame walker - */ - -static size_t hash_slot(uint64_t flags) -{ - size_t k = HASH_KEY(flags); - size_t i; - - /* Explicitly reserve slot 0 for flags 0: the following logic - * cannot distinguish an unoccupied slot from slot (flags==0). 
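- * Collisions are resolved by probing linearly through the remaining slots.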
- */ - if (flags == 0) - return 0; - - /* search through the remaining (HASH_SIZE-1) slots */ - for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) { - if (!k || k >= ARRAY_SIZE(page_flags)) - k = 1; - if (page_flags[k] == 0) { - page_flags[k] = flags; - return k; - } - if (page_flags[k] == flags) - return k; - } - - fatal("hash table full: bump up HASH_SHIFT?\n"); - exit(EXIT_FAILURE); -} - -static void add_page(unsigned long voffset, unsigned long offset, - uint64_t flags, uint64_t cgroup, uint64_t mapcnt, - uint64_t pme) -{ - flags = kpageflags_flags(flags, pme); - - if (!bit_mask_ok(flags)) - return; - - if (opt_cgroup && cgroup != (uint64_t)opt_cgroup) - return; - - if (opt_hwpoison) - hwpoison_page(offset); - if (opt_unpoison) - unpoison_page(offset); - - if (opt_mark_idle) - mark_page_idle(offset); - - if (opt_list == 1) - show_page_range(voffset, offset, 1, flags, cgroup, mapcnt); - else if (opt_list == 2) - show_page(voffset, offset, flags, cgroup, mapcnt); - - nr_pages[hash_slot(flags)]++; - total_pages++; -} - -#define KPAGEFLAGS_BATCH (64 << 10) /* 64k pages */ -static void walk_pfn(unsigned long voffset, - unsigned long index, - unsigned long count, - uint64_t pme) -{ - uint64_t buf[KPAGEFLAGS_BATCH]; - uint64_t cgi[KPAGEFLAGS_BATCH]; - uint64_t cnt[KPAGEFLAGS_BATCH]; - unsigned long batch; - unsigned long pages; - unsigned long i; - - /* - * kpagecgroup_read() reads only if kpagecgroup were opened, but - * /proc/kpagecgroup might even not exist, so it's better to fill - * them with zeros here. - */ - if (count == 1) - cgi[0] = 0; - else - memset(cgi, 0, sizeof cgi); - - while (count) { - batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH); - pages = kpageflags_read(buf, index, batch); - if (pages == 0) - break; - - if (kpagecgroup_read(cgi, index, pages) != pages) - fatal("kpagecgroup returned fewer pages than expected"); - - if (kpagecount_read(cnt, index, pages) != pages) - fatal("kpagecount returned fewer pages than expected"); - - for (i = 0; i < pages; i++) - add_page(voffset + i, index + i, - buf[i], cgi[i], cnt[i], pme); - - index += pages; - count -= pages; - } -} - -static void walk_swap(unsigned long voffset, uint64_t pme) -{ - uint64_t flags = kpageflags_flags(0, pme); - - if (!bit_mask_ok(flags)) - return; - - if (opt_cgroup) - return; - - if (opt_list == 1) - show_page_range(voffset, pagemap_swap_offset(pme), - 1, flags, 0, 0); - else if (opt_list == 2) - show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0); - - nr_pages[hash_slot(flags)]++; - total_pages++; -} - -#define PAGEMAP_BATCH (64 << 10) -static void walk_vma(unsigned long index, unsigned long count) -{ - uint64_t buf[PAGEMAP_BATCH]; - unsigned long batch; - unsigned long pages; - unsigned long pfn; - unsigned long i; - - while (count) { - batch = min_t(unsigned long, count, PAGEMAP_BATCH); - pages = pagemap_read(buf, index, batch); - if (pages == 0) - break; - - for (i = 0; i < pages; i++) { - pfn = pagemap_pfn(buf[i]); - if (pfn) - walk_pfn(index + i, pfn, 1, buf[i]); - if (buf[i] & PM_SWAP) - walk_swap(index + i, buf[i]); - } - - index += pages; - count -= pages; - } -} - -static void walk_task(unsigned long index, unsigned long count) -{ - const unsigned long end = index + count; - unsigned long start; - int i = 0; - - while (index < end) { - - while (pg_end[i] <= index) - if (++i >= nr_vmas) - return; - if (pg_start[i] >= end) - return; - - start = max_t(unsigned long, pg_start[i], index); - index = min_t(unsigned long, pg_end[i], end); - - assert(start < index); - walk_vma(start, index - 
start); - } -} - -static void add_addr_range(unsigned long offset, unsigned long size) -{ - if (nr_addr_ranges >= MAX_ADDR_RANGES) - fatal("too many addr ranges\n"); - - opt_offset[nr_addr_ranges] = offset; - opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset); - nr_addr_ranges++; -} - -static void walk_addr_ranges(void) -{ - int i; - - kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); - - if (!nr_addr_ranges) - add_addr_range(0, ULONG_MAX); - - for (i = 0; i < nr_addr_ranges; i++) - if (!opt_pid) - walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0); - else - walk_task(opt_offset[i], opt_size[i]); - - if (opt_mark_idle) - mark_page_idle(0); - - close(kpageflags_fd); -} - - -/* - * user interface - */ - -static const char *page_flag_type(uint64_t flag) -{ - if (flag & KPF_HACKERS_BITS) - return "(r)"; - if (flag & KPF_OVERLOADED_BITS) - return "(o)"; - return " "; -} - -static void usage(void) -{ - size_t i, j; - - printf( -"page-types [options]\n" -" -r|--raw Raw mode, for kernel developers\n" -" -d|--describe flags Describe flags\n" -" -a|--addr addr-spec Walk a range of pages\n" -" -b|--bits bits-spec Walk pages with specified bits\n" -" -c|--cgroup path|@inode Walk pages within memory cgroup\n" -" -p|--pid pid Walk process address space\n" -" -f|--file filename Walk file address space\n" -" -i|--mark-idle Mark pages idle\n" -" -l|--list Show page details in ranges\n" -" -L|--list-each Show page details one by one\n" -" -C|--list-cgroup Show cgroup inode for pages\n" -" -M|--list-mapcnt Show page map count\n" -" -N|--no-summary Don't show summary info\n" -" -X|--hwpoison hwpoison pages\n" -" -x|--unpoison unpoison pages\n" -" -F|--kpageflags filename kpageflags file to parse\n" -" -h|--help Show this usage message\n" -"flags:\n" -" 0x10 bitfield format, e.g.\n" -" anon bit-name, e.g.\n" -" 0x10,anon comma-separated list, e.g.\n" -"addr-spec:\n" -" N one page at offset N (unit: pages)\n" -" N+M pages range from N to N+M-1\n" -" N,M pages range from N to M-1\n" -" N, pages range from N to end\n" -" ,M pages range from 0 to M-1\n" -"bits-spec:\n" -" bit1,bit2 (flags & (bit1|bit2)) != 0\n" -" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" -" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n" -" =bit1,bit2 flags == (bit1|bit2)\n" -"bit-names:\n" - ); - - for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { - if (!page_flag_names[i]) - continue; - printf("%16s%s", page_flag_names[i] + 2, - page_flag_type(1ULL << i)); - if (++j > 3) { - j = 0; - putchar('\n'); - } - } - printf("\n " - "(r) raw mode bits (o) overloaded bits\n"); -} - -static unsigned long long parse_number(const char *str) -{ - unsigned long long n; - - n = strtoll(str, NULL, 0); - - if (n == 0 && str[0] != '0') - fatal("invalid name or number: %s\n", str); - - return n; -} - -static void parse_pid(const char *str) -{ - FILE *file; - char buf[5000]; - - opt_pid = parse_number(str); - - sprintf(buf, "/proc/%d/pagemap", opt_pid); - pagemap_fd = checked_open(buf, O_RDONLY); - - sprintf(buf, "/proc/%d/maps", opt_pid); - file = fopen(buf, "r"); - if (!file) { - perror(buf); - exit(EXIT_FAILURE); - } - - while (fgets(buf, sizeof(buf), file) != NULL) { - unsigned long vm_start; - unsigned long vm_end; - unsigned long long pgoff; - int major, minor; - char r, w, x, s; - unsigned long ino; - int n; - - n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu", - &vm_start, - &vm_end, - &r, &w, &x, &s, - &pgoff, - &major, &minor, - &ino); - if (n < 10) { - fprintf(stderr, "unexpected line: %s\n", buf); - continue; - 
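- /* tolerate malformed maps lines rather than aborting the walk */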
} - pg_start[nr_vmas] = vm_start / page_size; - pg_end[nr_vmas] = vm_end / page_size; - if (++nr_vmas >= MAX_VMAS) { - fprintf(stderr, "too many VMAs\n"); - break; - } - } - fclose(file); -} - -static void show_file(const char *name, const struct stat *st) -{ - unsigned long long size = st->st_size; - char atime[64], mtime[64]; - long now = time(NULL); - - printf("%s\tInode: %u\tSize: %llu (%llu pages)\n", - name, (unsigned)st->st_ino, - size, (size + page_size - 1) / page_size); - - strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime)); - strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime)); - - printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n", - mtime, now - st->st_mtime, - atime, now - st->st_atime); -} - -static sigjmp_buf sigbus_jmp; - -static void * volatile sigbus_addr; - -static void sigbus_handler(int sig, siginfo_t *info, void *ucontex) -{ - (void)sig; - (void)ucontex; - sigbus_addr = info ? info->si_addr : NULL; - siglongjmp(sigbus_jmp, 1); -} - -static struct sigaction sigbus_action = { - .sa_sigaction = sigbus_handler, - .sa_flags = SA_SIGINFO, -}; - -static void walk_file_range(const char *name, int fd, - unsigned long off, unsigned long end) -{ - uint8_t vec[PAGEMAP_BATCH]; - uint64_t buf[PAGEMAP_BATCH], flags; - uint64_t cgroup = 0; - uint64_t mapcnt = 0; - unsigned long nr_pages, pfn, i; - ssize_t len; - void *ptr; - int first = 1; - - for (; off < end; off += len) { - nr_pages = (end - off + page_size - 1) / page_size; - if (nr_pages > PAGEMAP_BATCH) - nr_pages = PAGEMAP_BATCH; - len = nr_pages * page_size; - - ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off); - if (ptr == MAP_FAILED) - fatal("mmap failed: %s", name); - - /* determine cached pages */ - if (mincore(ptr, len, vec)) - fatal("mincore failed: %s", name); - - /* turn off readahead */ - if (madvise(ptr, len, MADV_RANDOM)) - fatal("madvice failed: %s", name); - - if (sigsetjmp(sigbus_jmp, 1)) { - end = off + sigbus_addr ? 
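- /* note that '+' binds tighter than '?:', so the condition tested is (off + sigbus_addr) */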
sigbus_addr - ptr : 0; - fprintf(stderr, "got sigbus at offset %lld: %s\n", - (long long)end, name); - goto got_sigbus; - } - - /* populate ptes */ - for (i = 0; i < nr_pages ; i++) { - if (vec[i] & 1) - (void)*(volatile int *)(ptr + i * page_size); - } -got_sigbus: - - /* turn off harvesting reference bits */ - if (madvise(ptr, len, MADV_SEQUENTIAL)) - fatal("madvice failed: %s", name); - - if (pagemap_read(buf, (unsigned long)ptr / page_size, - nr_pages) != nr_pages) - fatal("cannot read pagemap"); - - munmap(ptr, len); - - for (i = 0; i < nr_pages; i++) { - pfn = pagemap_pfn(buf[i]); - if (!pfn) - continue; - if (!kpageflags_read(&flags, pfn, 1)) - continue; - if (!kpagecgroup_read(&cgroup, pfn, 1)) - fatal("kpagecgroup_read failed"); - if (!kpagecount_read(&mapcnt, pfn, 1)) - fatal("kpagecount_read failed"); - if (first && opt_list) { - first = 0; - flush_page_range(); - } - add_page(off / page_size + i, pfn, - flags, cgroup, mapcnt, buf[i]); - } - } -} - -static void walk_file(const char *name, const struct stat *st) -{ - int i; - int fd; - - fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW); - - if (!nr_addr_ranges) - add_addr_range(0, st->st_size / page_size); - - for (i = 0; i < nr_addr_ranges; i++) - walk_file_range(name, fd, opt_offset[i] * page_size, - (opt_offset[i] + opt_size[i]) * page_size); - - close(fd); -} - -int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f) -{ - (void)f; - switch (type) { - case FTW_F: - if (S_ISREG(st->st_mode)) - walk_file(name, st); - break; - case FTW_DNR: - fprintf(stderr, "cannot read dir: %s\n", name); - break; - } - return 0; -} - -struct stat st; - -static void walk_page_cache(void) -{ - kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); - pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY); - sigaction(SIGBUS, &sigbus_action, NULL); - - if (stat(opt_file, &st)) - fatal("stat failed: %s\n", opt_file); - - if (S_ISREG(st.st_mode)) { - walk_file(opt_file, &st); - } else if (S_ISDIR(st.st_mode)) { - /* do not follow symlinks and mountpoints */ - if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0) - fatal("nftw failed: %s\n", opt_file); - } else - fatal("unhandled file type: %s\n", opt_file); - - close(kpageflags_fd); - close(pagemap_fd); - signal(SIGBUS, SIG_DFL); -} - -static void parse_file(const char *name) -{ - opt_file = name; -} - -static void parse_cgroup(const char *path) -{ - if (path[0] == '@') { - opt_cgroup = parse_number(path + 1); - return; - } - - struct stat st; - - if (stat(path, &st)) - fatal("stat failed: %s: %m\n", path); - - if (!S_ISDIR(st.st_mode)) - fatal("cgroup supposed to be a directory: %s\n", path); - - opt_cgroup = st.st_ino; -} - -static void parse_addr_range(const char *optarg) -{ - unsigned long offset; - unsigned long size; - char *p; - - p = strchr(optarg, ','); - if (!p) - p = strchr(optarg, '+'); - - if (p == optarg) { - offset = 0; - size = parse_number(p + 1); - } else if (p) { - offset = parse_number(optarg); - if (p[1] == '\0') - size = ULONG_MAX; - else { - size = parse_number(p + 1); - if (*p == ',') { - if (size < offset) - fatal("invalid range: %lu,%lu\n", - offset, size); - size -= offset; - } - } - } else { - offset = parse_number(optarg); - size = 1; - } - - add_addr_range(offset, size); -} - -static void add_bits_filter(uint64_t mask, uint64_t bits) -{ - if (nr_bit_filters >= MAX_BIT_FILTERS) - fatal("too much bit filters\n"); - - opt_mask[nr_bit_filters] = mask; - opt_bits[nr_bit_filters] = bits; - nr_bit_filters++; -} - -static uint64_t 
parse_flag_name(const char *str, int len) -{ - size_t i; - - if (!*str || !len) - return 0; - - if (len <= 8 && !strncmp(str, "compound", len)) - return BITS_COMPOUND; - - for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) { - if (!page_flag_names[i]) - continue; - if (!strncmp(str, page_flag_names[i] + 2, len)) - return 1ULL << i; - } - - return parse_number(str); -} - -static uint64_t parse_flag_names(const char *str, int all) -{ - const char *p = str; - uint64_t flags = 0; - - while (1) { - if (*p == ',' || *p == '=' || *p == '\0') { - if ((*str != '~') || (*str == '~' && all && *++str)) - flags |= parse_flag_name(str, p - str); - if (*p != ',') - break; - str = p + 1; - } - p++; - } - - return flags; -} - -static void parse_bits_mask(const char *optarg) -{ - uint64_t mask; - uint64_t bits; - const char *p; - - p = strchr(optarg, '='); - if (p == optarg) { - mask = KPF_ALL_BITS; - bits = parse_flag_names(p + 1, 0); - } else if (p) { - mask = parse_flag_names(optarg, 0); - bits = parse_flag_names(p + 1, 0); - } else if (strchr(optarg, '~')) { - mask = parse_flag_names(optarg, 1); - bits = parse_flag_names(optarg, 0); - } else { - mask = parse_flag_names(optarg, 0); - bits = KPF_ALL_BITS; - } - - add_bits_filter(mask, bits); -} - -static void parse_kpageflags(const char *name) -{ - opt_kpageflags = name; -} - -static void describe_flags(const char *optarg) -{ - uint64_t flags = parse_flag_names(optarg, 0); - - printf("0x%016llx\t%s\t%s\n", - (unsigned long long)flags, - page_flag_name(flags), - page_flag_longname(flags)); -} - -static const struct option opts[] = { - { "raw" , 0, NULL, 'r' }, - { "pid" , 1, NULL, 'p' }, - { "file" , 1, NULL, 'f' }, - { "addr" , 1, NULL, 'a' }, - { "bits" , 1, NULL, 'b' }, - { "cgroup" , 1, NULL, 'c' }, - { "describe" , 1, NULL, 'd' }, - { "mark-idle" , 0, NULL, 'i' }, - { "list" , 0, NULL, 'l' }, - { "list-each" , 0, NULL, 'L' }, - { "list-cgroup", 0, NULL, 'C' }, - { "list-mapcnt", 0, NULL, 'M' }, - { "no-summary", 0, NULL, 'N' }, - { "hwpoison" , 0, NULL, 'X' }, - { "unpoison" , 0, NULL, 'x' }, - { "kpageflags", 0, NULL, 'F' }, - { "help" , 0, NULL, 'h' }, - { NULL , 0, NULL, 0 } -}; - -int main(int argc, char *argv[]) -{ - int c; - - page_size = getpagesize(); - - while ((c = getopt_long(argc, argv, - "rp:f:a:b:d:c:CilLMNXxF:h", - opts, NULL)) != -1) { - switch (c) { - case 'r': - opt_raw = 1; - break; - case 'p': - parse_pid(optarg); - break; - case 'f': - parse_file(optarg); - break; - case 'a': - parse_addr_range(optarg); - break; - case 'b': - parse_bits_mask(optarg); - break; - case 'c': - parse_cgroup(optarg); - break; - case 'C': - opt_list_cgroup = 1; - break; - case 'd': - describe_flags(optarg); - exit(0); - case 'i': - opt_mark_idle = 1; - break; - case 'l': - opt_list = 1; - break; - case 'L': - opt_list = 2; - break; - case 'M': - opt_list_mapcnt = 1; - break; - case 'N': - opt_no_summary = 1; - break; - case 'X': - opt_hwpoison = 1; - prepare_hwpoison_fd(); - break; - case 'x': - opt_unpoison = 1; - prepare_hwpoison_fd(); - break; - case 'F': - parse_kpageflags(optarg); - break; - case 'h': - usage(); - exit(0); - default: - usage(); - exit(1); - } - } - - if (!opt_kpageflags) - opt_kpageflags = PROC_KPAGEFLAGS; - - if (opt_cgroup || opt_list_cgroup) - kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY); - - if (opt_list && opt_list_mapcnt) - kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY); - - if (opt_mark_idle) - page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR); - - if (opt_list && opt_pid) - 
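- /* emit header columns matching the per-page fields printed later */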
printf("voffset\t"); - if (opt_list && opt_file) - printf("foffset\t"); - if (opt_list && opt_list_cgroup) - printf("cgroup\t"); - if (opt_list && opt_list_mapcnt) - printf("map-cnt\t"); - - if (opt_list == 1) - printf("offset\tlen\tflags\n"); - if (opt_list == 2) - printf("offset\tflags\n"); - - if (opt_file) - walk_page_cache(); - else - walk_addr_ranges(); - - if (opt_list == 1) - flush_page_range(); - - if (opt_no_summary) - return 0; - - if (opt_list) - printf("\n\n"); - - if (opt_file) { - show_file(opt_file, &st); - printf("\n"); - } - - show_summary(); - - if (opt_list_mapcnt) - close(kpagecount_fd); - - if (page_idle_fd >= 0) - close(page_idle_fd); - - return 0; -} diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c deleted file mode 100644 index 7c2ac124cdc8..000000000000 --- a/tools/vm/page_owner_sort.c +++ /dev/null @@ -1,897 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * User-space helper to sort the output of /sys/kernel/debug/page_owner - * - * Example use: - * cat /sys/kernel/debug/page_owner > page_owner_full.txt - * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt - * Or sort by total memory: - * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt - * - * See Documentation/mm/page_owner.rst -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define bool int -#define true 1 -#define false 0 -#define TASK_COMM_LEN 16 - -struct block_list { - char *txt; - char *comm; // task command name - char *stacktrace; - __u64 ts_nsec; - __u64 free_ts_nsec; - int len; - int num; - int page_num; - pid_t pid; - pid_t tgid; - int allocator; -}; -enum FILTER_BIT { - FILTER_UNRELEASE = 1<<1, - FILTER_PID = 1<<2, - FILTER_TGID = 1<<3, - FILTER_COMM = 1<<4 -}; -enum CULL_BIT { - CULL_UNRELEASE = 1<<1, - CULL_PID = 1<<2, - CULL_TGID = 1<<3, - CULL_COMM = 1<<4, - CULL_STACKTRACE = 1<<5, - CULL_ALLOCATOR = 1<<6 -}; -enum ALLOCATOR_BIT { - ALLOCATOR_CMA = 1<<1, - ALLOCATOR_SLAB = 1<<2, - ALLOCATOR_VMALLOC = 1<<3, - ALLOCATOR_OTHERS = 1<<4 -}; -enum ARG_TYPE { - ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_FREE_TS, - ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE, - ARG_ALLOCATOR -}; -enum SORT_ORDER { - SORT_ASC = 1, - SORT_DESC = -1, -}; -struct filter_condition { - pid_t *pids; - pid_t *tgids; - char **comms; - int pids_size; - int tgids_size; - int comms_size; -}; -struct sort_condition { - int (**cmps)(const void *, const void *); - int *signs; - int size; -}; -static struct filter_condition fc; -static struct sort_condition sc; -static regex_t order_pattern; -static regex_t pid_pattern; -static regex_t tgid_pattern; -static regex_t comm_pattern; -static regex_t ts_nsec_pattern; -static regex_t free_ts_nsec_pattern; -static struct block_list *list; -static int list_size; -static int max_size; -static int cull; -static int filter; -static bool debug_on; - -static void set_single_cmp(int (*cmp)(const void *, const void *), int sign); - -int read_block(char *buf, char *ext_buf, int buf_size, FILE *fin) -{ - char *curr = buf, *const buf_end = buf + buf_size; - - while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) { - if (*curr == '\n') { /* empty line */ - return curr - buf; - } - if (!strncmp(curr, "PFN", 3)) { - strcpy(ext_buf, curr); - continue; - } - curr += strlen(curr); - } - - return -1; /* EOF or no space left in buf. 
*/ -} - -static int compare_txt(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return strcmp(l1->txt, l2->txt); -} - -static int compare_stacktrace(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return strcmp(l1->stacktrace, l2->stacktrace); -} - -static int compare_num(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->num - l2->num; -} - -static int compare_page_num(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->page_num - l2->page_num; -} - -static int compare_pid(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->pid - l2->pid; -} - -static int compare_tgid(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->tgid - l2->tgid; -} - -static int compare_allocator(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->allocator - l2->allocator; -} - -static int compare_comm(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return strcmp(l1->comm, l2->comm); -} - -static int compare_ts(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->ts_nsec < l2->ts_nsec ? -1 : 1; -} - -static int compare_free_ts(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1; -} - -static int compare_release(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - if (!l1->free_ts_nsec && !l2->free_ts_nsec) - return 0; - if (l1->free_ts_nsec && l2->free_ts_nsec) - return 0; - return l1->free_ts_nsec ? 
1 : -1; -} - -static int compare_cull_condition(const void *p1, const void *p2) -{ - if (cull == 0) - return compare_txt(p1, p2); - if ((cull & CULL_STACKTRACE) && compare_stacktrace(p1, p2)) - return compare_stacktrace(p1, p2); - if ((cull & CULL_PID) && compare_pid(p1, p2)) - return compare_pid(p1, p2); - if ((cull & CULL_TGID) && compare_tgid(p1, p2)) - return compare_tgid(p1, p2); - if ((cull & CULL_COMM) && compare_comm(p1, p2)) - return compare_comm(p1, p2); - if ((cull & CULL_UNRELEASE) && compare_release(p1, p2)) - return compare_release(p1, p2); - if ((cull & CULL_ALLOCATOR) && compare_allocator(p1, p2)) - return compare_allocator(p1, p2); - return 0; -} - -static int compare_sort_condition(const void *p1, const void *p2) -{ - int cmp = 0; - - for (int i = 0; i < sc.size; ++i) - if (cmp == 0) - cmp = sc.signs[i] * sc.cmps[i](p1, p2); - return cmp; -} - -static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) -{ - int err, val_len; - regmatch_t pmatch[2]; - - err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL); - if (err != 0 || pmatch[1].rm_so == -1) { - if (debug_on) - fprintf(stderr, "no matching pattern in %s\n", buf); - return -1; - } - val_len = pmatch[1].rm_eo - pmatch[1].rm_so; - - memcpy(pattern_str, buf + pmatch[1].rm_so, val_len); - - return 0; -} - -static bool check_regcomp(regex_t *pattern, const char *regex) -{ - int err; - - err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); - if (err != 0 || pattern->re_nsub != 1) { - fprintf(stderr, "Invalid pattern %s code %d\n", regex, err); - return false; - } - return true; -} - -static char **explode(char sep, const char *str, int *size) -{ - int count = 0, len = strlen(str); - int lastindex = -1, j = 0; - - for (int i = 0; i < len; i++) - if (str[i] == sep) - count++; - char **ret = calloc(++count, sizeof(char *)); - - for (int i = 0; i < len; i++) { - if (str[i] == sep) { - ret[j] = calloc(i - lastindex, sizeof(char)); - memcpy(ret[j++], str + lastindex + 1, i - lastindex - 1); - lastindex = i; - } - } - if (lastindex <= len - 1) { - ret[j] = calloc(len - lastindex, sizeof(char)); - memcpy(ret[j++], str + lastindex + 1, strlen(str) - 1 - lastindex); - } - *size = j; - return ret; -} - -static void free_explode(char **arr, int size) -{ - for (int i = 0; i < size; i++) - free(arr[i]); - free(arr); -} - -# define FIELD_BUFF 25 - -static int get_page_num(char *buf) -{ - int order_val; - char order_str[FIELD_BUFF] = {0}; - char *endptr; - - search_pattern(&order_pattern, order_str, buf); - errno = 0; - order_val = strtol(order_str, &endptr, 10); - if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') { - if (debug_on) - fprintf(stderr, "wrong order in follow buf:\n%s\n", buf); - return 0; - } - - return 1 << order_val; -} - -static pid_t get_pid(char *buf) -{ - pid_t pid; - char pid_str[FIELD_BUFF] = {0}; - char *endptr; - - search_pattern(&pid_pattern, pid_str, buf); - errno = 0; - pid = strtol(pid_str, &endptr, 10); - if (errno != 0 || endptr == pid_str || *endptr != '\0') { - if (debug_on) - fprintf(stderr, "wrong/invalid pid in follow buf:\n%s\n", buf); - return -1; - } - - return pid; - -} - -static pid_t get_tgid(char *buf) -{ - pid_t tgid; - char tgid_str[FIELD_BUFF] = {0}; - char *endptr; - - search_pattern(&tgid_pattern, tgid_str, buf); - errno = 0; - tgid = strtol(tgid_str, &endptr, 10); - if (errno != 0 || endptr == tgid_str || *endptr != '\0') { - if (debug_on) - fprintf(stderr, "wrong/invalid tgid in follow buf:\n%s\n", buf); - return -1; - } - - return tgid; - -} - 
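All the numeric-field helpers in this file (get_page_num(), get_pid(), get_tgid() and the two timestamp parsers below) pair a regex capture with the same strict conversion idiom: reset errno, convert with strtol()/strtoull(), then accept the result only if errno stayed zero, at least one digit was consumed, and nothing trails the number. A minimal self-contained sketch of that idiom, with hypothetical names, purely illustrative and not part of this patch:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse a strictly-decimal field; returns -1 and sets *err on bad input. */
static long parse_decimal_strict(const char *s, int *err)
{
	char *endptr;
	long val;

	errno = 0;			/* must be reset by hand, see below */
	val = strtol(s, &endptr, 10);
	*err = (errno != 0 ||		/* ERANGE: value out of range */
		endptr == s ||		/* no digits were consumed */
		*endptr != '\0');	/* trailing junk after the number */
	return *err ? -1 : val;
}

int main(void)
{
	int err;

	printf("%ld\n", parse_decimal_strict("42", &err));	/* 42, err == 0 */
	parse_decimal_strict("42x", &err);			/* err == 1 */
	parse_decimal_strict("", &err);				/* err == 1 */
	return 0;
}

Resetting errno before the call matters because a successful strtol() leaves errno untouched, so a stale ERANGE from an earlier conversion would otherwise be misread as a failure.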
-static __u64 get_ts_nsec(char *buf) -{ - __u64 ts_nsec; - char ts_nsec_str[FIELD_BUFF] = {0}; - char *endptr; - - search_pattern(&ts_nsec_pattern, ts_nsec_str, buf); - errno = 0; - ts_nsec = strtoull(ts_nsec_str, &endptr, 10); - if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') { - if (debug_on) - fprintf(stderr, "wrong ts_nsec in follow buf:\n%s\n", buf); - return -1; - } - - return ts_nsec; -} - -static __u64 get_free_ts_nsec(char *buf) -{ - __u64 free_ts_nsec; - char free_ts_nsec_str[FIELD_BUFF] = {0}; - char *endptr; - - search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf); - errno = 0; - free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10); - if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') { - if (debug_on) - fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf); - return -1; - } - - return free_ts_nsec; -} - -static char *get_comm(char *buf) -{ - char *comm_str = malloc(TASK_COMM_LEN); - - memset(comm_str, 0, TASK_COMM_LEN); - - search_pattern(&comm_pattern, comm_str, buf); - errno = 0; - if (errno != 0) { - if (debug_on) - fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf); - return NULL; - } - - return comm_str; -} - -static int get_arg_type(const char *arg) -{ - if (!strcmp(arg, "pid") || !strcmp(arg, "p")) - return ARG_PID; - else if (!strcmp(arg, "tgid") || !strcmp(arg, "tg")) - return ARG_TGID; - else if (!strcmp(arg, "name") || !strcmp(arg, "n")) - return ARG_COMM; - else if (!strcmp(arg, "stacktrace") || !strcmp(arg, "st")) - return ARG_STACKTRACE; - else if (!strcmp(arg, "free") || !strcmp(arg, "f")) - return ARG_FREE; - else if (!strcmp(arg, "txt") || !strcmp(arg, "T")) - return ARG_TXT; - else if (!strcmp(arg, "free_ts") || !strcmp(arg, "ft")) - return ARG_FREE_TS; - else if (!strcmp(arg, "alloc_ts") || !strcmp(arg, "at")) - return ARG_ALLOC_TS; - else if (!strcmp(arg, "allocator") || !strcmp(arg, "ator")) - return ARG_ALLOCATOR; - else { - return ARG_UNKNOWN; - } -} - -static int get_allocator(const char *buf, const char *migrate_info) -{ - char *tmp, *first_line, *second_line; - int allocator = 0; - - if (strstr(migrate_info, "CMA")) - allocator |= ALLOCATOR_CMA; - if (strstr(migrate_info, "slab")) - allocator |= ALLOCATOR_SLAB; - tmp = strstr(buf, "__vmalloc_node_range"); - if (tmp) { - second_line = tmp; - while (*tmp != '\n') - tmp--; - tmp--; - while (*tmp != '\n') - tmp--; - first_line = ++tmp; - tmp = strstr(tmp, "alloc_pages"); - if (tmp && first_line <= tmp && tmp < second_line) - allocator |= ALLOCATOR_VMALLOC; - } - if (allocator == 0) - allocator = ALLOCATOR_OTHERS; - return allocator; -} - -static bool match_num_list(int num, int *list, int list_size) -{ - for (int i = 0; i < list_size; ++i) - if (list[i] == num) - return true; - return false; -} - -static bool match_str_list(const char *str, char **list, int list_size) -{ - for (int i = 0; i < list_size; ++i) - if (!strcmp(list[i], str)) - return true; - return false; -} - -static bool is_need(char *buf) -{ - __u64 ts_nsec, free_ts_nsec; - - ts_nsec = get_ts_nsec(buf); - free_ts_nsec = get_free_ts_nsec(buf); - - if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec) - return false; - if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) - return false; - if ((filter & FILTER_TGID) && - !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size)) - return false; - - char *comm = get_comm(buf); - - if ((filter & FILTER_COMM) && - !match_str_list(comm, fc.comms, fc.comms_size)) { - free(comm); - return false; 
- } - free(comm); - return true; -} - -static bool add_list(char *buf, int len, char *ext_buf) -{ - if (list_size != 0 && - len == list[list_size-1].len && - memcmp(buf, list[list_size-1].txt, len) == 0) { - list[list_size-1].num++; - list[list_size-1].page_num += get_page_num(buf); - return true; - } - if (list_size == max_size) { - fprintf(stderr, "max_size too small??\n"); - return false; - } - if (!is_need(buf)) - return true; - list[list_size].pid = get_pid(buf); - list[list_size].tgid = get_tgid(buf); - list[list_size].comm = get_comm(buf); - list[list_size].txt = malloc(len+1); - if (!list[list_size].txt) { - fprintf(stderr, "Out of memory\n"); - return false; - } - memcpy(list[list_size].txt, buf, len); - list[list_size].txt[len] = 0; - list[list_size].len = len; - list[list_size].num = 1; - list[list_size].page_num = get_page_num(buf); - - list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; - if (*list[list_size].stacktrace == '\n') - list[list_size].stacktrace++; - list[list_size].ts_nsec = get_ts_nsec(buf); - list[list_size].free_ts_nsec = get_free_ts_nsec(buf); - list[list_size].allocator = get_allocator(buf, ext_buf); - list_size++; - if (list_size % 1000 == 0) { - printf("loaded %d\r", list_size); - fflush(stdout); - } - return true; -} - -static bool parse_cull_args(const char *arg_str) -{ - int size = 0; - char **args = explode(',', arg_str, &size); - - for (int i = 0; i < size; ++i) { - int arg_type = get_arg_type(args[i]); - - if (arg_type == ARG_PID) - cull |= CULL_PID; - else if (arg_type == ARG_TGID) - cull |= CULL_TGID; - else if (arg_type == ARG_COMM) - cull |= CULL_COMM; - else if (arg_type == ARG_STACKTRACE) - cull |= CULL_STACKTRACE; - else if (arg_type == ARG_FREE) - cull |= CULL_UNRELEASE; - else if (arg_type == ARG_ALLOCATOR) - cull |= CULL_ALLOCATOR; - else { - free_explode(args, size); - return false; - } - } - free_explode(args, size); - if (sc.size == 0) - set_single_cmp(compare_num, SORT_DESC); - return true; -} - -static void set_single_cmp(int (*cmp)(const void *, const void *), int sign) -{ - if (sc.signs == NULL || sc.size < 1) - sc.signs = calloc(1, sizeof(int)); - sc.signs[0] = sign; - if (sc.cmps == NULL || sc.size < 1) - sc.cmps = calloc(1, sizeof(int *)); - sc.cmps[0] = cmp; - sc.size = 1; -} - -static bool parse_sort_args(const char *arg_str) -{ - int size = 0; - - if (sc.size != 0) { /* reset sort_condition */ - free(sc.signs); - free(sc.cmps); - size = 0; - } - - char **args = explode(',', arg_str, &size); - - sc.signs = calloc(size, sizeof(int)); - sc.cmps = calloc(size, sizeof(int *)); - for (int i = 0; i < size; ++i) { - int offset = 0; - - sc.signs[i] = SORT_ASC; - if (args[i][0] == '-' || args[i][0] == '+') { - if (args[i][0] == '-') - sc.signs[i] = SORT_DESC; - offset = 1; - } - - int arg_type = get_arg_type(args[i]+offset); - - if (arg_type == ARG_PID) - sc.cmps[i] = compare_pid; - else if (arg_type == ARG_TGID) - sc.cmps[i] = compare_tgid; - else if (arg_type == ARG_COMM) - sc.cmps[i] = compare_comm; - else if (arg_type == ARG_STACKTRACE) - sc.cmps[i] = compare_stacktrace; - else if (arg_type == ARG_ALLOC_TS) - sc.cmps[i] = compare_ts; - else if (arg_type == ARG_FREE_TS) - sc.cmps[i] = compare_free_ts; - else if (arg_type == ARG_TXT) - sc.cmps[i] = compare_txt; - else if (arg_type == ARG_ALLOCATOR) - sc.cmps[i] = compare_allocator; - else { - free_explode(args, size); - sc.size = 0; - return false; - } - } - sc.size = size; - free_explode(args, size); - return true; -} - -static int *parse_nums_list(char *arg_str, int 
								  *list_size)
-{
-	int size = 0;
-	char **args = explode(',', arg_str, &size);
-	int *list = calloc(size, sizeof(int));
-
-	errno = 0;
-	for (int i = 0; i < size; ++i) {
-		char *endptr = NULL;
-
-		list[i] = strtol(args[i], &endptr, 10);
-		if (errno != 0 || endptr == args[i] || *endptr != '\0') {
-			free(list);
-			return NULL;
-		}
-	}
-	*list_size = size;
-	free_explode(args, size);
-	return list;
-}
-
-static void print_allocator(FILE *out, int allocator)
-{
-	fprintf(out, "allocated by ");
-	if (allocator & ALLOCATOR_CMA)
-		fprintf(out, "CMA ");
-	if (allocator & ALLOCATOR_SLAB)
-		fprintf(out, "SLAB ");
-	if (allocator & ALLOCATOR_VMALLOC)
-		fprintf(out, "VMALLOC ");
-	if (allocator & ALLOCATOR_OTHERS)
-		fprintf(out, "OTHERS ");
-}
-
-#define BUF_SIZE	(128 * 1024)
-
-static void usage(void)
-{
-	printf("Usage: ./page_owner_sort [OPTIONS] <input> <output>\n"
-		"-m\t\tSort by total memory.\n"
-		"-s\t\tSort by the stack trace.\n"
-		"-t\t\tSort by times (default).\n"
-		"-p\t\tSort by pid.\n"
-		"-P\t\tSort by tgid.\n"
-		"-n\t\tSort by task command name.\n"
-		"-a\t\tSort by memory allocate time.\n"
-		"-r\t\tSort by memory release time.\n"
-		"-f\t\tFilter out the information of blocks whose memory has been released.\n"
-		"-d\t\tPrint debug information.\n"
-		"--pid <pidlist>\tSelect by pid. This selects the information of blocks whose process ID numbers appear in <pidlist>.\n"
-		"--tgid <tgidlist>\tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in <tgidlist>.\n"
-		"--name <cmdlist>\n\t\tSelect by command name. This selects the information of blocks whose command name appears in <cmdlist>.\n"
-		"--cull <rules>\tCull by user-defined rules. <rules> is a single argument in the form of a comma-separated list with some common fields predefined\n"
-		"--sort <order>\tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n"
-	);
-}
-
-int main(int argc, char **argv)
-{
-	FILE *fin, *fout;
-	char *buf, *ext_buf;
-	int i, count;
-	struct stat st;
-	int opt;
-	struct option longopts[] = {
-		{ "pid", required_argument, NULL, 1 },
-		{ "tgid", required_argument, NULL, 2 },
-		{ "name", required_argument, NULL, 3 },
-		{ "cull", required_argument, NULL, 4 },
-		{ "sort", required_argument, NULL, 5 },
-		{ 0, 0, 0, 0},
-	};
-
-	while ((opt = getopt_long(argc, argv, "adfmnprstP", longopts, NULL)) != -1)
-		switch (opt) {
-		case 'a':
-			set_single_cmp(compare_ts, SORT_ASC);
-			break;
-		case 'd':
-			debug_on = true;
-			break;
-		case 'f':
-			filter = filter | FILTER_UNRELEASE;
-			break;
-		case 'm':
-			set_single_cmp(compare_page_num, SORT_DESC);
-			break;
-		case 'p':
-			set_single_cmp(compare_pid, SORT_ASC);
-			break;
-		case 'r':
-			set_single_cmp(compare_free_ts, SORT_ASC);
-			break;
-		case 's':
-			set_single_cmp(compare_stacktrace, SORT_ASC);
-			break;
-		case 't':
-			set_single_cmp(compare_num, SORT_DESC);
-			break;
-		case 'P':
-			set_single_cmp(compare_tgid, SORT_ASC);
-			break;
-		case 'n':
-			set_single_cmp(compare_comm, SORT_ASC);
-			break;
-		case 1:
-			filter = filter | FILTER_PID;
-			fc.pids = parse_nums_list(optarg, &fc.pids_size);
-			if (fc.pids == NULL) {
-				fprintf(stderr, "wrong/invalid pid in from the command line:%s\n",
-						optarg);
-				exit(1);
-			}
-			break;
-		case 2:
-			filter = filter | FILTER_TGID;
-			fc.tgids = parse_nums_list(optarg, &fc.tgids_size);
-			if (fc.tgids == NULL) {
-				fprintf(stderr, "wrong/invalid tgid in from the command line:%s\n",
-						optarg);
-				exit(1);
-			}
-			break;
-		case 3:
-			filter = filter | FILTER_COMM;
-			fc.comms = explode(',', optarg, &fc.comms_size);
-			break;
-		case 4:
-			if (!parse_cull_args(optarg)) {
-				fprintf(stderr, "wrong argument after --cull option:%s\n",
-
optarg); - exit(1); - } - break; - case 5: - if (!parse_sort_args(optarg)) { - fprintf(stderr, "wrong argument after --sort option:%s\n", - optarg); - exit(1); - } - break; - default: - usage(); - exit(1); - } - - if (optind >= (argc - 1)) { - usage(); - exit(1); - } - - fin = fopen(argv[optind], "r"); - fout = fopen(argv[optind + 1], "w"); - if (!fin || !fout) { - usage(); - perror("open: "); - exit(1); - } - - if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),")) - goto out_order; - if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),")) - goto out_pid; - if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) ")) - goto out_tgid; - if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts")) - goto out_comm; - if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,")) - goto out_ts; - if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns")) - goto out_free_ts; - - fstat(fileno(fin), &st); - max_size = st.st_size / 100; /* hack ... */ - - list = malloc(max_size * sizeof(*list)); - buf = malloc(BUF_SIZE); - ext_buf = malloc(BUF_SIZE); - if (!list || !buf || !ext_buf) { - fprintf(stderr, "Out of memory\n"); - goto out_free; - } - - for ( ; ; ) { - int buf_len = read_block(buf, ext_buf, BUF_SIZE, fin); - - if (buf_len < 0) - break; - if (!add_list(buf, buf_len, ext_buf)) - goto out_free; - } - - printf("loaded %d\n", list_size); - - printf("sorting ....\n"); - - qsort(list, list_size, sizeof(list[0]), compare_cull_condition); - - printf("culling\n"); - - for (i = count = 0; i < list_size; i++) { - if (count == 0 || - compare_cull_condition((void *)(&list[count-1]), (void *)(&list[i])) != 0) { - list[count++] = list[i]; - } else { - list[count-1].num += list[i].num; - list[count-1].page_num += list[i].page_num; - } - } - - qsort(list, count, sizeof(list[0]), compare_sort_condition); - - for (i = 0; i < count; i++) { - if (cull == 0) { - fprintf(fout, "%d times, %d pages, ", list[i].num, list[i].page_num); - print_allocator(fout, list[i].allocator); - fprintf(fout, ":\n%s\n", list[i].txt); - } - else { - fprintf(fout, "%d times, %d pages", - list[i].num, list[i].page_num); - if (cull & CULL_PID || filter & FILTER_PID) - fprintf(fout, ", PID %d", list[i].pid); - if (cull & CULL_TGID || filter & FILTER_TGID) - fprintf(fout, ", TGID %d", list[i].pid); - if (cull & CULL_COMM || filter & FILTER_COMM) - fprintf(fout, ", task_comm_name: %s", list[i].comm); - if (cull & CULL_ALLOCATOR) { - fprintf(fout, ", "); - print_allocator(fout, list[i].allocator); - } - if (cull & CULL_UNRELEASE) - fprintf(fout, " (%s)", - list[i].free_ts_nsec ? 
"UNRELEASED" : "RELEASED"); - if (cull & CULL_STACKTRACE) - fprintf(fout, ":\n%s", list[i].stacktrace); - fprintf(fout, "\n"); - } - } - -out_free: - if (ext_buf) - free(ext_buf); - if (buf) - free(buf); - if (list) - free(list); -out_free_ts: - regfree(&free_ts_nsec_pattern); -out_ts: - regfree(&ts_nsec_pattern); -out_comm: - regfree(&comm_pattern); -out_tgid: - regfree(&tgid_pattern); -out_pid: - regfree(&pid_pattern); -out_order: - regfree(&order_pattern); - - return 0; -} diff --git a/tools/vm/slabinfo-gnuplot.sh b/tools/vm/slabinfo-gnuplot.sh deleted file mode 100644 index 873a892147e5..000000000000 --- a/tools/vm/slabinfo-gnuplot.sh +++ /dev/null @@ -1,268 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0-only - -# Sergey Senozhatsky, 2015 -# sergey.senozhatsky.work@gmail.com -# - - -# This program is intended to plot a `slabinfo -X' stats, collected, -# for example, using the following command: -# while [ 1 ]; do slabinfo -X >> stats; sleep 1; done -# -# Use `slabinfo-gnuplot.sh stats' to pre-process collected records -# and generate graphs (totals, slabs sorted by size, slabs sorted -# by size). -# -# Graphs can be [individually] regenerate with different ranges and -# size (-r %d,%d and -s %d,%d options). -# -# To visually compare N `totals' graphs, do -# slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals -# - -min_slab_name_size=11 -xmin=0 -xmax=0 -width=1500 -height=700 -mode=preprocess - -usage() -{ - echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]" - echo "FILEs must contain 'slabinfo -X' samples" - echo "-t - plot totals for FILE(s)" - echo "-l - plot slabs stats for FILE(s)" - echo "-s %d,%d - set image width and height" - echo "-r %d,%d - use data samples from a given range" -} - -check_file_exist() -{ - if [ ! -f "$1" ]; then - echo "File '$1' does not exist" - exit 1 - fi -} - -do_slabs_plotting() -{ - local file=$1 - local out_file - local range="every ::$xmin" - local xtic="" - local xtic_rotate="norotate" - local lines=2000000 - local wc_lines - - check_file_exist "$file" - - out_file=`basename "$file"` - if [ $xmax -ne 0 ]; then - range="$range::$xmax" - lines=$((xmax-xmin)) - fi - - wc_lines=`cat "$file" | wc -l` - if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then - wc_lines=$lines - fi - - if [ "$wc_lines" -lt "$lines" ]; then - lines=$wc_lines - fi - - if [ $((width / lines)) -gt $min_slab_name_size ]; then - xtic=":xtic(1)" - xtic_rotate=90 - fi - -gnuplot -p << EOF -#!/usr/bin/env gnuplot - -set terminal png enhanced size $width,$height large -set output '$out_file.png' -set autoscale xy -set xlabel 'samples' -set ylabel 'bytes' -set style histogram columnstacked title textcolor lt -1 -set style fill solid 0.15 -set xtics rotate $xtic_rotate -set key left above Left title reverse - -plot "$file" $range u 2$xtic title 'SIZE' with boxes,\ - '' $range u 3 title 'LOSS' with boxes -EOF - - if [ $? 
-eq 0 ]; then - echo "$out_file.png" - fi -} - -do_totals_plotting() -{ - local gnuplot_cmd="" - local range="every ::$xmin" - local file="" - - if [ $xmax -ne 0 ]; then - range="$range::$xmax" - fi - - for i in "${t_files[@]}"; do - check_file_exist "$i" - - file="$file"`basename "$i"` - gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\ - '$i Memory usage' with lines," - gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \ - '$i Loss' with lines," - done - -gnuplot -p << EOF -#!/usr/bin/env gnuplot - -set terminal png enhanced size $width,$height large -set autoscale xy -set output '$file.png' -set xlabel 'samples' -set ylabel 'bytes' -set key left above Left title reverse - -plot $gnuplot_cmd -EOF - - if [ $? -eq 0 ]; then - echo "$file.png" - fi -} - -do_preprocess() -{ - local out - local lines - local in=$1 - - check_file_exist "$in" - - # use only 'TOP' slab (biggest memory usage or loss) - let lines=3 - out=`basename "$in"`"-slabs-by-loss" - `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\ - grep -E -iv '\-\-|Name|Slabs'\ - | awk '{print $1" "$4+$2*$3" "$4}' > "$out"` - if [ $? -eq 0 ]; then - do_slabs_plotting "$out" - fi - - let lines=3 - out=`basename "$in"`"-slabs-by-size" - `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\ - grep -E -iv '\-\-|Name|Slabs'\ - | awk '{print $1" "$4" "$4-$2*$3}' > "$out"` - if [ $? -eq 0 ]; then - do_slabs_plotting "$out" - fi - - out=`basename "$in"`"-totals" - `cat "$in" | grep "Memory used" |\ - awk '{print $3" "$7}' > "$out"` - if [ $? -eq 0 ]; then - t_files[0]=$out - do_totals_plotting - fi -} - -parse_opts() -{ - local opt - - while getopts "tlr::s::h" opt; do - case $opt in - t) - mode=totals - ;; - l) - mode=slabs - ;; - s) - array=(${OPTARG//,/ }) - width=${array[0]} - height=${array[1]} - ;; - r) - array=(${OPTARG//,/ }) - xmin=${array[0]} - xmax=${array[1]} - ;; - h) - usage - exit 0 - ;; - \?) - echo "Invalid option: -$OPTARG" >&2 - exit 1 - ;; - :) - echo "-$OPTARG requires an argument." >&2 - exit 1 - ;; - esac - done - - return $OPTIND -} - -parse_args() -{ - local idx=0 - local p - - for p in "$@"; do - case $mode in - preprocess) - files[$idx]=$p - idx=$idx+1 - ;; - totals) - t_files[$idx]=$p - idx=$idx+1 - ;; - slabs) - files[$idx]=$p - idx=$idx+1 - ;; - esac - done -} - -parse_opts "$@" -argstart=$? 
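The hand-off from parse_opts() to parse_args() just above and below uses a small getopts convention worth noting: the function returns $OPTIND as its exit status, and the caller slices the remaining positional parameters off with ${@:$argstart}. A self-contained bash sketch of the idiom, illustrative only and not part of this patch (demo_parse_opts is a hypothetical name; an exit status is eight bits wide, so this is only safe while OPTIND stays below 256):

demo_parse_opts()
{
	local opt

	while getopts "m:" opt; do
		case $opt in
		m)
			mode=$OPTARG
			;;
		esac
	done

	# hand the index of the first non-option argument back to the caller
	return $OPTIND
}

demo_parse_opts "$@"
argstart=$?
# everything from position $argstart onward is a file argument
echo "mode=$mode files: ${@:$argstart}"

Invoked as ./demo.sh -m totals f1 f2, getopts leaves OPTIND at 3, so the sketch prints "mode=totals files: f1 f2".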
-parse_args "${@:$argstart}"
-
-if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then
-	usage
-	exit 1
-fi
-
-case $mode in
-	preprocess)
-		for i in "${files[@]}"; do
-			do_preprocess "$i"
-		done
-		;;
-	totals)
-		do_totals_plotting
-		;;
-	slabs)
-		for i in "${files[@]}"; do
-			do_slabs_plotting "$i"
-		done
-		;;
-	*)
-		echo "Unknown mode $mode" >&2
-		usage
-		exit 1
-		;;
-esac
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
deleted file mode 100644
index cfaeaea71042..000000000000
--- a/tools/vm/slabinfo.c
+++ /dev/null
@@ -1,1544 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Slabinfo: Tool to get reports about slabs
- *
- * (C) 2007 sgi, Christoph Lameter
- * (C) 2011 Linux Foundation, Christoph Lameter
- *
- * Compile with:
- *
- * gcc -o slabinfo slabinfo.c
- */
-#include <stdlib.h>
-#include <sys/types.h>
-#include <dirent.h>
-#include <strings.h>
-#include <string.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <getopt.h>
-#include <regex.h>
-#include <errno.h>
-#include <stdio.h>
-
-#define MAX_SLABS 500
-#define MAX_ALIASES 500
-#define MAX_NODES 1024
-
-struct slabinfo {
-	char *name;
-	int alias;
-	int refs;
-	int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu;
-	unsigned int hwcache_align, object_size, objs_per_slab;
-	unsigned int sanity_checks, slab_size, store_user, trace;
-	int order, poison, reclaim_account, red_zone;
-	unsigned long partial, objects, slabs, objects_partial, objects_total;
-	unsigned long alloc_fastpath, alloc_slowpath;
-	unsigned long free_fastpath, free_slowpath;
-	unsigned long free_frozen, free_add_partial, free_remove_partial;
-	unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill;
-	unsigned long cpuslab_flush, deactivate_full, deactivate_empty;
-	unsigned long deactivate_to_head, deactivate_to_tail;
-	unsigned long deactivate_remote_frees, order_fallback;
-	unsigned long cmpxchg_double_cpu_fail, cmpxchg_double_fail;
-	unsigned long alloc_node_mismatch, deactivate_bypass;
-	unsigned long cpu_partial_alloc, cpu_partial_free;
-	int numa[MAX_NODES];
-	int numa_partial[MAX_NODES];
-} slabinfo[MAX_SLABS];
-
-struct aliasinfo {
-	char *name;
-	char *ref;
-	struct slabinfo *slab;
-} aliasinfo[MAX_ALIASES];
-
-int slabs;
-int actual_slabs;
-int aliases;
-int alias_targets;
-int highest_node;
-
-char buffer[4096];
-
-int show_empty;
-int show_report;
-int show_alias;
-int show_slab;
-int skip_zero = 1;
-int show_numa;
-int show_track;
-int show_first_alias;
-int validate;
-int shrink;
-int show_inverted;
-int show_single_ref;
-int show_totals;
-int sort_size;
-int sort_active;
-int set_debug;
-int show_ops;
-int sort_partial;
-int show_activity;
-int output_lines = -1;
-int sort_loss;
-int extended_totals;
-int show_bytes;
-int unreclaim_only;
-
-/* Debug options */
-int sanity;
-int redzone;
-int poison;
-int tracking;
-int tracing;
-
-int page_size;
-
-regex_t pattern;
-
-static void fatal(const char *x, ...)
-{
-	va_list ap;
-
-	va_start(ap, x);
-	vfprintf(stderr, x, ap);
-	va_end(ap);
-	exit(EXIT_FAILURE);
-}
-
-static void usage(void)
-{
-	printf("slabinfo 4/15/2011. 
(c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" - "slabinfo [-aABDefhilLnoPrsStTUvXz1] [N=K] [-dafzput] [slab-regexp]\n" - "-a|--aliases Show aliases\n" - "-A|--activity Most active slabs first\n" - "-B|--Bytes Show size in bytes\n" - "-D|--display-active Switch line format to activity\n" - "-e|--empty Show empty slabs\n" - "-f|--first-alias Show first alias\n" - "-h|--help Show usage information\n" - "-i|--inverted Inverted list\n" - "-l|--slabs Show slabs\n" - "-L|--Loss Sort by loss\n" - "-n|--numa Show NUMA information\n" - "-N|--lines=K Show the first K slabs\n" - "-o|--ops Show kmem_cache_ops\n" - "-P|--partial Sort by number of partial slabs\n" - "-r|--report Detailed report on single slabs\n" - "-s|--shrink Shrink slabs\n" - "-S|--Size Sort by size\n" - "-t|--tracking Show alloc/free information\n" - "-T|--Totals Show summary information\n" - "-U|--Unreclaim Show unreclaimable slabs only\n" - "-v|--validate Validate slabs\n" - "-X|--Xtotals Show extended summary information\n" - "-z|--zero Include empty slabs\n" - "-1|--1ref Single reference\n" - - "\n" - "-d | --debug Switch off all debug options\n" - "-da | --debug=a Switch on all debug options (--debug=FZPU)\n" - - "\n" - "-d[afzput] | --debug=[afzput]\n" - " f | F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" - " z | Z Redzoning\n" - " p | P Poisoning\n" - " u | U Tracking\n" - " t | T Tracing\n" - - "\nSorting options (--Loss, --Size, --Partial) are mutually exclusive\n" - ); -} - -static unsigned long read_obj(const char *name) -{ - FILE *f = fopen(name, "r"); - - if (!f) { - buffer[0] = 0; - if (errno == EACCES) - fatal("%s, Try using superuser\n", strerror(errno)); - } else { - if (!fgets(buffer, sizeof(buffer), f)) - buffer[0] = 0; - fclose(f); - if (buffer[strlen(buffer)] == '\n') - buffer[strlen(buffer)] = 0; - } - return strlen(buffer); -} - - -/* - * Get the contents of an attribute - */ -static unsigned long get_obj(const char *name) -{ - if (!read_obj(name)) - return 0; - - return atol(buffer); -} - -static unsigned long get_obj_and_str(const char *name, char **x) -{ - unsigned long result = 0; - char *p; - - *x = NULL; - - if (!read_obj(name)) { - x = NULL; - return 0; - } - result = strtoul(buffer, &p, 10); - while (*p == ' ') - p++; - if (*p) - *x = strdup(p); - return result; -} - -static void set_obj(struct slabinfo *s, const char *name, int n) -{ - char x[100]; - FILE *f; - - snprintf(x, 100, "%s/%s", s->name, name); - f = fopen(x, "w"); - if (!f) - fatal("Cannot write to %s\n", x); - - fprintf(f, "%d\n", n); - fclose(f); -} - -static unsigned long read_slab_obj(struct slabinfo *s, const char *name) -{ - char x[100]; - FILE *f; - size_t l; - - snprintf(x, 100, "%s/%s", s->name, name); - f = fopen(x, "r"); - if (!f) { - buffer[0] = 0; - l = 0; - } else { - l = fread(buffer, 1, sizeof(buffer), f); - buffer[l] = 0; - fclose(f); - } - return l; -} - -static unsigned long read_debug_slab_obj(struct slabinfo *s, const char *name) -{ - char x[128]; - FILE *f; - size_t l; - - snprintf(x, 128, "/sys/kernel/debug/slab/%s/%s", s->name, name); - f = fopen(x, "r"); - if (!f) { - buffer[0] = 0; - l = 0; - } else { - l = fread(buffer, 1, sizeof(buffer), f); - buffer[l] = 0; - fclose(f); - } - return l; -} - -/* - * Put a size string together - */ -static int store_size(char *buffer, unsigned long value) -{ - unsigned long divisor = 1; - char trailer = 0; - int n; - - if (!show_bytes) { - if (value > 1000000000UL) { - divisor = 100000000UL; - trailer = 'G'; - } else if (value > 1000000UL) { - divisor = 100000UL; - trailer = 'M'; - } 
else if (value > 1000UL) { - divisor = 100; - trailer = 'K'; - } - } - - value /= divisor; - n = sprintf(buffer, "%ld",value); - if (trailer) { - buffer[n] = trailer; - n++; - buffer[n] = 0; - } - if (divisor != 1) { - memmove(buffer + n - 2, buffer + n - 3, 4); - buffer[n-2] = '.'; - n++; - } - return n; -} - -static void decode_numa_list(int *numa, char *t) -{ - int node; - int nr; - - memset(numa, 0, MAX_NODES * sizeof(int)); - - if (!t) - return; - - while (*t == 'N') { - t++; - node = strtoul(t, &t, 10); - if (*t == '=') { - t++; - nr = strtoul(t, &t, 10); - numa[node] = nr; - if (node > highest_node) - highest_node = node; - } - while (*t == ' ') - t++; - } -} - -static void slab_validate(struct slabinfo *s) -{ - if (strcmp(s->name, "*") == 0) - return; - - set_obj(s, "validate", 1); -} - -static void slab_shrink(struct slabinfo *s) -{ - if (strcmp(s->name, "*") == 0) - return; - - set_obj(s, "shrink", 1); -} - -int line = 0; - -static void first_line(void) -{ - if (show_activity) - printf("Name Objects Alloc Free" - " %%Fast Fallb O CmpX UL\n"); - else - printf("Name Objects Objsize %s " - "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n", - sort_loss ? " Loss" : "Space"); -} - -/* - * Find the shortest alias of a slab - */ -static struct aliasinfo *find_one_alias(struct slabinfo *find) -{ - struct aliasinfo *a; - struct aliasinfo *best = NULL; - - for(a = aliasinfo;a < aliasinfo + aliases; a++) { - if (a->slab == find && - (!best || strlen(best->name) < strlen(a->name))) { - best = a; - if (strncmp(a->name,"kmall", 5) == 0) - return best; - } - } - return best; -} - -static unsigned long slab_size(struct slabinfo *s) -{ - return s->slabs * (page_size << s->order); -} - -static unsigned long slab_activity(struct slabinfo *s) -{ - return s->alloc_fastpath + s->free_fastpath + - s->alloc_slowpath + s->free_slowpath; -} - -static unsigned long slab_waste(struct slabinfo *s) -{ - return slab_size(s) - s->objects * s->object_size; -} - -static void slab_numa(struct slabinfo *s, int mode) -{ - int node; - - if (strcmp(s->name, "*") == 0) - return; - - if (!highest_node) { - printf("\n%s: No NUMA information available.\n", s->name); - return; - } - - if (skip_zero && !s->slabs) - return; - - if (!line) { - printf("\n%-21s:", mode ? "NUMA nodes" : "Slab"); - for(node = 0; node <= highest_node; node++) - printf(" %4d", node); - printf("\n----------------------"); - for(node = 0; node <= highest_node; node++) - printf("-----"); - printf("\n"); - } - printf("%-21s ", mode ? 
"All slabs" : s->name); - for(node = 0; node <= highest_node; node++) { - char b[20]; - - store_size(b, s->numa[node]); - printf(" %4s", b); - } - printf("\n"); - if (mode) { - printf("%-21s ", "Partial slabs"); - for(node = 0; node <= highest_node; node++) { - char b[20]; - - store_size(b, s->numa_partial[node]); - printf(" %4s", b); - } - printf("\n"); - } - line++; -} - -static void show_tracking(struct slabinfo *s) -{ - printf("\n%s: Kernel object allocation\n", s->name); - printf("-----------------------------------------------------------------------\n"); - if (read_debug_slab_obj(s, "alloc_traces")) - printf("%s", buffer); - else if (read_slab_obj(s, "alloc_calls")) - printf("%s", buffer); - else - printf("No Data\n"); - - printf("\n%s: Kernel object freeing\n", s->name); - printf("------------------------------------------------------------------------\n"); - if (read_debug_slab_obj(s, "free_traces")) - printf("%s", buffer); - else if (read_slab_obj(s, "free_calls")) - printf("%s", buffer); - else - printf("No Data\n"); - -} - -static void ops(struct slabinfo *s) -{ - if (strcmp(s->name, "*") == 0) - return; - - if (read_slab_obj(s, "ops")) { - printf("\n%s: kmem_cache operations\n", s->name); - printf("--------------------------------------------\n"); - printf("%s", buffer); - } else - printf("\n%s has no kmem_cache operations\n", s->name); -} - -static const char *onoff(int x) -{ - if (x) - return "On "; - return "Off"; -} - -static void slab_stats(struct slabinfo *s) -{ - unsigned long total_alloc; - unsigned long total_free; - unsigned long total; - - if (!s->alloc_slab) - return; - - total_alloc = s->alloc_fastpath + s->alloc_slowpath; - total_free = s->free_fastpath + s->free_slowpath; - - if (!total_alloc) - return; - - printf("\n"); - printf("Slab Perf Counter Alloc Free %%Al %%Fr\n"); - printf("--------------------------------------------------\n"); - printf("Fastpath %8lu %8lu %3lu %3lu\n", - s->alloc_fastpath, s->free_fastpath, - s->alloc_fastpath * 100 / total_alloc, - total_free ? s->free_fastpath * 100 / total_free : 0); - printf("Slowpath %8lu %8lu %3lu %3lu\n", - total_alloc - s->alloc_fastpath, s->free_slowpath, - (total_alloc - s->alloc_fastpath) * 100 / total_alloc, - total_free ? s->free_slowpath * 100 / total_free : 0); - printf("Page Alloc %8lu %8lu %3lu %3lu\n", - s->alloc_slab, s->free_slab, - s->alloc_slab * 100 / total_alloc, - total_free ? s->free_slab * 100 / total_free : 0); - printf("Add partial %8lu %8lu %3lu %3lu\n", - s->deactivate_to_head + s->deactivate_to_tail, - s->free_add_partial, - (s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc, - total_free ? s->free_add_partial * 100 / total_free : 0); - printf("Remove partial %8lu %8lu %3lu %3lu\n", - s->alloc_from_partial, s->free_remove_partial, - s->alloc_from_partial * 100 / total_alloc, - total_free ? s->free_remove_partial * 100 / total_free : 0); - - printf("Cpu partial list %8lu %8lu %3lu %3lu\n", - s->cpu_partial_alloc, s->cpu_partial_free, - s->cpu_partial_alloc * 100 / total_alloc, - total_free ? s->cpu_partial_free * 100 / total_free : 0); - - printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n", - s->deactivate_remote_frees, s->free_frozen, - s->deactivate_remote_frees * 100 / total_alloc, - total_free ? 
s->free_frozen * 100 / total_free : 0); - - printf("Total %8lu %8lu\n\n", total_alloc, total_free); - - if (s->cpuslab_flush) - printf("Flushes %8lu\n", s->cpuslab_flush); - - total = s->deactivate_full + s->deactivate_empty + - s->deactivate_to_head + s->deactivate_to_tail + s->deactivate_bypass; - - if (total) { - printf("\nSlab Deactivation Occurrences %%\n"); - printf("-------------------------------------------------\n"); - printf("Slab full %7lu %3lu%%\n", - s->deactivate_full, (s->deactivate_full * 100) / total); - printf("Slab empty %7lu %3lu%%\n", - s->deactivate_empty, (s->deactivate_empty * 100) / total); - printf("Moved to head of partial list %7lu %3lu%%\n", - s->deactivate_to_head, (s->deactivate_to_head * 100) / total); - printf("Moved to tail of partial list %7lu %3lu%%\n", - s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total); - printf("Deactivation bypass %7lu %3lu%%\n", - s->deactivate_bypass, (s->deactivate_bypass * 100) / total); - printf("Refilled from foreign frees %7lu %3lu%%\n", - s->alloc_refill, (s->alloc_refill * 100) / total); - printf("Node mismatch %7lu %3lu%%\n", - s->alloc_node_mismatch, (s->alloc_node_mismatch * 100) / total); - } - - if (s->cmpxchg_double_fail || s->cmpxchg_double_cpu_fail) { - printf("\nCmpxchg_double Looping\n------------------------\n"); - printf("Locked Cmpxchg Double redos %lu\nUnlocked Cmpxchg Double redos %lu\n", - s->cmpxchg_double_fail, s->cmpxchg_double_cpu_fail); - } -} - -static void report(struct slabinfo *s) -{ - if (strcmp(s->name, "*") == 0) - return; - - printf("\nSlabcache: %-15s Aliases: %2d Order : %2d Objects: %lu\n", - s->name, s->aliases, s->order, s->objects); - if (s->hwcache_align) - printf("** Hardware cacheline aligned\n"); - if (s->cache_dma) - printf("** Memory is allocated in a special DMA zone\n"); - if (s->destroy_by_rcu) - printf("** Slabs are destroyed via RCU\n"); - if (s->reclaim_account) - printf("** Reclaim accounting active\n"); - - printf("\nSizes (bytes) Slabs Debug Memory\n"); - printf("------------------------------------------------------------------------\n"); - printf("Object : %7d Total : %7ld Sanity Checks : %s Total: %7ld\n", - s->object_size, s->slabs, onoff(s->sanity_checks), - s->slabs * (page_size << s->order)); - printf("SlabObj: %7d Full : %7ld Redzoning : %s Used : %7ld\n", - s->slab_size, s->slabs - s->partial - s->cpu_slabs, - onoff(s->red_zone), s->objects * s->object_size); - printf("SlabSiz: %7d Partial: %7ld Poisoning : %s Loss : %7ld\n", - page_size << s->order, s->partial, onoff(s->poison), - s->slabs * (page_size << s->order) - s->objects * s->object_size); - printf("Loss : %7d CpuSlab: %7d Tracking : %s Lalig: %7ld\n", - s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user), - (s->slab_size - s->object_size) * s->objects); - printf("Align : %7d Objects: %7d Tracing : %s Lpadd: %7ld\n", - s->align, s->objs_per_slab, onoff(s->trace), - ((page_size << s->order) - s->objs_per_slab * s->slab_size) * - s->slabs); - - ops(s); - show_tracking(s); - slab_numa(s, 1); - slab_stats(s); -} - -static void slabcache(struct slabinfo *s) -{ - char size_str[20]; - char dist_str[40]; - char flags[20]; - char *p = flags; - - if (strcmp(s->name, "*") == 0) - return; - - if (unreclaim_only && s->reclaim_account) - return; - - if (actual_slabs == 1) { - report(s); - return; - } - - if (skip_zero && !show_empty && !s->slabs) - return; - - if (show_empty && s->slabs) - return; - - if (sort_loss == 0) - store_size(size_str, slab_size(s)); - else - store_size(size_str, 
slab_waste(s)); - snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, - s->partial, s->cpu_slabs); - - if (!line++) - first_line(); - - if (s->aliases) - *p++ = '*'; - if (s->cache_dma) - *p++ = 'd'; - if (s->hwcache_align) - *p++ = 'A'; - if (s->poison) - *p++ = 'P'; - if (s->reclaim_account) - *p++ = 'a'; - if (s->red_zone) - *p++ = 'Z'; - if (s->sanity_checks) - *p++ = 'F'; - if (s->store_user) - *p++ = 'U'; - if (s->trace) - *p++ = 'T'; - - *p = 0; - if (show_activity) { - unsigned long total_alloc; - unsigned long total_free; - - total_alloc = s->alloc_fastpath + s->alloc_slowpath; - total_free = s->free_fastpath + s->free_slowpath; - - printf("%-21s %8ld %10ld %10ld %3ld %3ld %5ld %1d %4ld %4ld\n", - s->name, s->objects, - total_alloc, total_free, - total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0, - total_free ? (s->free_fastpath * 100 / total_free) : 0, - s->order_fallback, s->order, s->cmpxchg_double_fail, - s->cmpxchg_double_cpu_fail); - } else { - printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n", - s->name, s->objects, s->object_size, size_str, dist_str, - s->objs_per_slab, s->order, - s->slabs ? (s->partial * 100) / s->slabs : 100, - s->slabs ? (s->objects * s->object_size * 100) / - (s->slabs * (page_size << s->order)) : 100, - flags); - } -} - -/* - * Analyze debug options. Return false if something is amiss. - */ -static int debug_opt_scan(char *opt) -{ - if (!opt || !opt[0] || strcmp(opt, "-") == 0) - return 1; - - if (strcasecmp(opt, "a") == 0) { - sanity = 1; - poison = 1; - redzone = 1; - tracking = 1; - return 1; - } - - for ( ; *opt; opt++) - switch (*opt) { - case 'F' : case 'f': - if (sanity) - return 0; - sanity = 1; - break; - case 'P' : case 'p': - if (poison) - return 0; - poison = 1; - break; - - case 'Z' : case 'z': - if (redzone) - return 0; - redzone = 1; - break; - - case 'U' : case 'u': - if (tracking) - return 0; - tracking = 1; - break; - - case 'T' : case 't': - if (tracing) - return 0; - tracing = 1; - break; - default: - return 0; - } - return 1; -} - -static int slab_empty(struct slabinfo *s) -{ - if (s->objects > 0) - return 0; - - /* - * We may still have slabs even if there are no objects. Shrinking will - * remove them. 
- */ - if (s->slabs != 0) - set_obj(s, "shrink", 1); - - return 1; -} - -static void slab_debug(struct slabinfo *s) -{ - if (strcmp(s->name, "*") == 0) - return; - - if (sanity && !s->sanity_checks) { - set_obj(s, "sanity_checks", 1); - } - if (!sanity && s->sanity_checks) { - if (slab_empty(s)) - set_obj(s, "sanity_checks", 0); - else - fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name); - } - if (redzone && !s->red_zone) { - if (slab_empty(s)) - set_obj(s, "red_zone", 1); - else - fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name); - } - if (!redzone && s->red_zone) { - if (slab_empty(s)) - set_obj(s, "red_zone", 0); - else - fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name); - } - if (poison && !s->poison) { - if (slab_empty(s)) - set_obj(s, "poison", 1); - else - fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name); - } - if (!poison && s->poison) { - if (slab_empty(s)) - set_obj(s, "poison", 0); - else - fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name); - } - if (tracking && !s->store_user) { - if (slab_empty(s)) - set_obj(s, "store_user", 1); - else - fprintf(stderr, "%s not empty cannot enable tracking\n", s->name); - } - if (!tracking && s->store_user) { - if (slab_empty(s)) - set_obj(s, "store_user", 0); - else - fprintf(stderr, "%s not empty cannot disable tracking\n", s->name); - } - if (tracing && !s->trace) { - if (slabs == 1) - set_obj(s, "trace", 1); - else - fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name); - } - if (!tracing && s->trace) - set_obj(s, "trace", 1); -} - -static void totals(void) -{ - struct slabinfo *s; - - int used_slabs = 0; - char b1[20], b2[20], b3[20], b4[20]; - unsigned long long max = 1ULL << 63; - - /* Object size */ - unsigned long long min_objsize = max, max_objsize = 0, avg_objsize; - - /* Number of partial slabs in a slabcache */ - unsigned long long min_partial = max, max_partial = 0, - avg_partial, total_partial = 0; - - /* Number of slabs in a slab cache */ - unsigned long long min_slabs = max, max_slabs = 0, - avg_slabs, total_slabs = 0; - - /* Size of the whole slab */ - unsigned long long min_size = max, max_size = 0, - avg_size, total_size = 0; - - /* Bytes used for object storage in a slab */ - unsigned long long min_used = max, max_used = 0, - avg_used, total_used = 0; - - /* Waste: Bytes used for alignment and padding */ - unsigned long long min_waste = max, max_waste = 0, - avg_waste, total_waste = 0; - /* Number of objects in a slab */ - unsigned long long min_objects = max, max_objects = 0, - avg_objects, total_objects = 0; - /* Waste per object */ - unsigned long long min_objwaste = max, - max_objwaste = 0, avg_objwaste, - total_objwaste = 0; - - /* Memory per object */ - unsigned long long min_memobj = max, - max_memobj = 0, avg_memobj, - total_objsize = 0; - - /* Percentage of partial slabs per slab */ - unsigned long min_ppart = 100, max_ppart = 0, - avg_ppart, total_ppart = 0; - - /* Number of objects in partial slabs */ - unsigned long min_partobj = max, max_partobj = 0, - avg_partobj, total_partobj = 0; - - /* Percentage of partial objects of all objects in a slab */ - unsigned long min_ppartobj = 100, max_ppartobj = 0, - avg_ppartobj, total_ppartobj = 0; - - - for (s = slabinfo; s < slabinfo + slabs; s++) { - unsigned long long size; - unsigned long used; - unsigned long long wasted; - unsigned long long objwaste; - unsigned long percentage_partial_slabs; - unsigned long percentage_partial_objs; - - if 
(!s->slabs || !s->objects) - continue; - - used_slabs++; - - size = slab_size(s); - used = s->objects * s->object_size; - wasted = size - used; - objwaste = s->slab_size - s->object_size; - - percentage_partial_slabs = s->partial * 100 / s->slabs; - if (percentage_partial_slabs > 100) - percentage_partial_slabs = 100; - - percentage_partial_objs = s->objects_partial * 100 - / s->objects; - - if (percentage_partial_objs > 100) - percentage_partial_objs = 100; - - if (s->object_size < min_objsize) - min_objsize = s->object_size; - if (s->partial < min_partial) - min_partial = s->partial; - if (s->slabs < min_slabs) - min_slabs = s->slabs; - if (size < min_size) - min_size = size; - if (wasted < min_waste) - min_waste = wasted; - if (objwaste < min_objwaste) - min_objwaste = objwaste; - if (s->objects < min_objects) - min_objects = s->objects; - if (used < min_used) - min_used = used; - if (s->objects_partial < min_partobj) - min_partobj = s->objects_partial; - if (percentage_partial_slabs < min_ppart) - min_ppart = percentage_partial_slabs; - if (percentage_partial_objs < min_ppartobj) - min_ppartobj = percentage_partial_objs; - if (s->slab_size < min_memobj) - min_memobj = s->slab_size; - - if (s->object_size > max_objsize) - max_objsize = s->object_size; - if (s->partial > max_partial) - max_partial = s->partial; - if (s->slabs > max_slabs) - max_slabs = s->slabs; - if (size > max_size) - max_size = size; - if (wasted > max_waste) - max_waste = wasted; - if (objwaste > max_objwaste) - max_objwaste = objwaste; - if (s->objects > max_objects) - max_objects = s->objects; - if (used > max_used) - max_used = used; - if (s->objects_partial > max_partobj) - max_partobj = s->objects_partial; - if (percentage_partial_slabs > max_ppart) - max_ppart = percentage_partial_slabs; - if (percentage_partial_objs > max_ppartobj) - max_ppartobj = percentage_partial_objs; - if (s->slab_size > max_memobj) - max_memobj = s->slab_size; - - total_partial += s->partial; - total_slabs += s->slabs; - total_size += size; - total_waste += wasted; - - total_objects += s->objects; - total_used += used; - total_partobj += s->objects_partial; - total_ppart += percentage_partial_slabs; - total_ppartobj += percentage_partial_objs; - - total_objwaste += s->objects * objwaste; - total_objsize += s->objects * s->slab_size; - } - - if (!total_objects) { - printf("No objects\n"); - return; - } - if (!used_slabs) { - printf("No slabs\n"); - return; - } - - /* Per slab averages */ - avg_partial = total_partial / used_slabs; - avg_slabs = total_slabs / used_slabs; - avg_size = total_size / used_slabs; - avg_waste = total_waste / used_slabs; - - avg_objects = total_objects / used_slabs; - avg_used = total_used / used_slabs; - avg_partobj = total_partobj / used_slabs; - avg_ppart = total_ppart / used_slabs; - avg_ppartobj = total_ppartobj / used_slabs; - - /* Per object object sizes */ - avg_objsize = total_used / total_objects; - avg_objwaste = total_objwaste / total_objects; - avg_partobj = total_partobj * 100 / total_objects; - avg_memobj = total_objsize / total_objects; - - printf("Slabcache Totals\n"); - printf("----------------\n"); - printf("Slabcaches : %15d Aliases : %11d->%-3d Active: %3d\n", - slabs, aliases, alias_targets, used_slabs); - - store_size(b1, total_size);store_size(b2, total_waste); - store_size(b3, total_waste * 100 / total_used); - printf("Memory used: %15s # Loss : %15s MRatio:%6s%%\n", b1, b2, b3); - - store_size(b1, total_objects);store_size(b2, total_partobj); - store_size(b3, total_partobj * 100 / 
total_objects); - printf("# Objects : %15s # PartObj: %15s ORatio:%6s%%\n", b1, b2, b3); - - printf("\n"); - printf("Per Cache Average " - "Min Max Total\n"); - printf("---------------------------------------" - "-------------------------------------\n"); - - store_size(b1, avg_objects);store_size(b2, min_objects); - store_size(b3, max_objects);store_size(b4, total_objects); - printf("#Objects %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - store_size(b1, avg_slabs);store_size(b2, min_slabs); - store_size(b3, max_slabs);store_size(b4, total_slabs); - printf("#Slabs %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - store_size(b1, avg_partial);store_size(b2, min_partial); - store_size(b3, max_partial);store_size(b4, total_partial); - printf("#PartSlab %15s %15s %15s %15s\n", - b1, b2, b3, b4); - store_size(b1, avg_ppart);store_size(b2, min_ppart); - store_size(b3, max_ppart); - store_size(b4, total_partial * 100 / total_slabs); - printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n", - b1, b2, b3, b4); - - store_size(b1, avg_partobj);store_size(b2, min_partobj); - store_size(b3, max_partobj); - store_size(b4, total_partobj); - printf("PartObjs %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj); - store_size(b3, max_ppartobj); - store_size(b4, total_partobj * 100 / total_objects); - printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n", - b1, b2, b3, b4); - - store_size(b1, avg_size);store_size(b2, min_size); - store_size(b3, max_size);store_size(b4, total_size); - printf("Memory %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - store_size(b1, avg_used);store_size(b2, min_used); - store_size(b3, max_used);store_size(b4, total_used); - printf("Used %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - store_size(b1, avg_waste);store_size(b2, min_waste); - store_size(b3, max_waste);store_size(b4, total_waste); - printf("Loss %15s %15s %15s %15s\n", - b1, b2, b3, b4); - - printf("\n"); - printf("Per Object Average " - "Min Max\n"); - printf("---------------------------------------" - "--------------------\n"); - - store_size(b1, avg_memobj);store_size(b2, min_memobj); - store_size(b3, max_memobj); - printf("Memory %15s %15s %15s\n", - b1, b2, b3); - store_size(b1, avg_objsize);store_size(b2, min_objsize); - store_size(b3, max_objsize); - printf("User %15s %15s %15s\n", - b1, b2, b3); - - store_size(b1, avg_objwaste);store_size(b2, min_objwaste); - store_size(b3, max_objwaste); - printf("Loss %15s %15s %15s\n", - b1, b2, b3); -} - -static void sort_slabs(void) -{ - struct slabinfo *s1,*s2; - - for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) { - for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) { - int result; - - if (sort_size) { - if (slab_size(s1) == slab_size(s2)) - result = strcasecmp(s1->name, s2->name); - else - result = slab_size(s1) < slab_size(s2); - } else if (sort_active) { - if (slab_activity(s1) == slab_activity(s2)) - result = strcasecmp(s1->name, s2->name); - else - result = slab_activity(s1) < slab_activity(s2); - } else if (sort_loss) { - if (slab_waste(s1) == slab_waste(s2)) - result = strcasecmp(s1->name, s2->name); - else - result = slab_waste(s1) < slab_waste(s2); - } else if (sort_partial) { - if (s1->partial == s2->partial) - result = strcasecmp(s1->name, s2->name); - else - result = s1->partial < s2->partial; - } else - result = strcasecmp(s1->name, s2->name); - - if (show_inverted) - result = -result; - - if (result > 0) { - struct slabinfo t; - - memcpy(&t, s1, sizeof(struct slabinfo)); - memcpy(s1, s2, sizeof(struct slabinfo)); - memcpy(s2, 
&t, sizeof(struct slabinfo)); - } - } - } -} - -static void sort_aliases(void) -{ - struct aliasinfo *a1,*a2; - - for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) { - for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) { - char *n1, *n2; - - n1 = a1->name; - n2 = a2->name; - if (show_alias && !show_inverted) { - n1 = a1->ref; - n2 = a2->ref; - } - if (strcasecmp(n1, n2) > 0) { - struct aliasinfo t; - - memcpy(&t, a1, sizeof(struct aliasinfo)); - memcpy(a1, a2, sizeof(struct aliasinfo)); - memcpy(a2, &t, sizeof(struct aliasinfo)); - } - } - } -} - -static void link_slabs(void) -{ - struct aliasinfo *a; - struct slabinfo *s; - - for (a = aliasinfo; a < aliasinfo + aliases; a++) { - - for (s = slabinfo; s < slabinfo + slabs; s++) - if (strcmp(a->ref, s->name) == 0) { - a->slab = s; - s->refs++; - break; - } - if (s == slabinfo + slabs) - fatal("Unresolved alias %s\n", a->ref); - } -} - -static void alias(void) -{ - struct aliasinfo *a; - char *active = NULL; - - sort_aliases(); - link_slabs(); - - for(a = aliasinfo; a < aliasinfo + aliases; a++) { - - if (!show_single_ref && a->slab->refs == 1) - continue; - - if (!show_inverted) { - if (active) { - if (strcmp(a->slab->name, active) == 0) { - printf(" %s", a->name); - continue; - } - } - printf("\n%-12s <- %s", a->slab->name, a->name); - active = a->slab->name; - } - else - printf("%-15s -> %s\n", a->name, a->slab->name); - } - if (active) - printf("\n"); -} - - -static void rename_slabs(void) -{ - struct slabinfo *s; - struct aliasinfo *a; - - for (s = slabinfo; s < slabinfo + slabs; s++) { - if (*s->name != ':') - continue; - - if (s->refs > 1 && !show_first_alias) - continue; - - a = find_one_alias(s); - - if (a) - s->name = a->name; - else { - s->name = "*"; - actual_slabs--; - } - } -} - -static int slab_mismatch(char *slab) -{ - return regexec(&pattern, slab, 0, NULL, 0); -} - -static void read_slab_dir(void) -{ - DIR *dir; - struct dirent *de; - struct slabinfo *slab = slabinfo; - struct aliasinfo *alias = aliasinfo; - char *p; - char *t; - int count; - - if (chdir("/sys/kernel/slab") && chdir("/sys/slab")) - fatal("SYSFS support for SLUB not active\n"); - - dir = opendir("."); - while ((de = readdir(dir))) { - if (de->d_name[0] == '.' 
|| - (de->d_name[0] != ':' && slab_mismatch(de->d_name))) - continue; - switch (de->d_type) { - case DT_LNK: - alias->name = strdup(de->d_name); - count = readlink(de->d_name, buffer, sizeof(buffer)-1); - - if (count < 0) - fatal("Cannot read symlink %s\n", de->d_name); - - buffer[count] = 0; - p = buffer + count; - while (p > buffer && p[-1] != '/') - p--; - alias->ref = strdup(p); - alias++; - break; - case DT_DIR: - if (chdir(de->d_name)) - fatal("Unable to access slab %s\n", slab->name); - slab->name = strdup(de->d_name); - slab->alias = 0; - slab->refs = 0; - slab->aliases = get_obj("aliases"); - slab->align = get_obj("align"); - slab->cache_dma = get_obj("cache_dma"); - slab->cpu_slabs = get_obj("cpu_slabs"); - slab->destroy_by_rcu = get_obj("destroy_by_rcu"); - slab->hwcache_align = get_obj("hwcache_align"); - slab->object_size = get_obj("object_size"); - slab->objects = get_obj("objects"); - slab->objects_partial = get_obj("objects_partial"); - slab->objects_total = get_obj("objects_total"); - slab->objs_per_slab = get_obj("objs_per_slab"); - slab->order = get_obj("order"); - slab->partial = get_obj("partial"); - slab->partial = get_obj_and_str("partial", &t); - decode_numa_list(slab->numa_partial, t); - free(t); - slab->poison = get_obj("poison"); - slab->reclaim_account = get_obj("reclaim_account"); - slab->red_zone = get_obj("red_zone"); - slab->sanity_checks = get_obj("sanity_checks"); - slab->slab_size = get_obj("slab_size"); - slab->slabs = get_obj_and_str("slabs", &t); - decode_numa_list(slab->numa, t); - free(t); - slab->store_user = get_obj("store_user"); - slab->trace = get_obj("trace"); - slab->alloc_fastpath = get_obj("alloc_fastpath"); - slab->alloc_slowpath = get_obj("alloc_slowpath"); - slab->free_fastpath = get_obj("free_fastpath"); - slab->free_slowpath = get_obj("free_slowpath"); - slab->free_frozen= get_obj("free_frozen"); - slab->free_add_partial = get_obj("free_add_partial"); - slab->free_remove_partial = get_obj("free_remove_partial"); - slab->alloc_from_partial = get_obj("alloc_from_partial"); - slab->alloc_slab = get_obj("alloc_slab"); - slab->alloc_refill = get_obj("alloc_refill"); - slab->free_slab = get_obj("free_slab"); - slab->cpuslab_flush = get_obj("cpuslab_flush"); - slab->deactivate_full = get_obj("deactivate_full"); - slab->deactivate_empty = get_obj("deactivate_empty"); - slab->deactivate_to_head = get_obj("deactivate_to_head"); - slab->deactivate_to_tail = get_obj("deactivate_to_tail"); - slab->deactivate_remote_frees = get_obj("deactivate_remote_frees"); - slab->order_fallback = get_obj("order_fallback"); - slab->cmpxchg_double_cpu_fail = get_obj("cmpxchg_double_cpu_fail"); - slab->cmpxchg_double_fail = get_obj("cmpxchg_double_fail"); - slab->cpu_partial_alloc = get_obj("cpu_partial_alloc"); - slab->cpu_partial_free = get_obj("cpu_partial_free"); - slab->alloc_node_mismatch = get_obj("alloc_node_mismatch"); - slab->deactivate_bypass = get_obj("deactivate_bypass"); - chdir(".."); - if (slab->name[0] == ':') - alias_targets++; - slab++; - break; - default : - fatal("Unknown file type %lx\n", de->d_type); - } - } - closedir(dir); - slabs = slab - slabinfo; - actual_slabs = slabs; - aliases = alias - aliasinfo; - if (slabs > MAX_SLABS) - fatal("Too many slabs\n"); - if (aliases > MAX_ALIASES) - fatal("Too many aliases\n"); -} - -static void output_slabs(void) -{ - struct slabinfo *slab; - int lines = output_lines; - - for (slab = slabinfo; (slab < slabinfo + slabs) && - lines != 0; slab++) { - - if (slab->alias) - continue; - - if (lines != -1) - 
lines--; - - if (show_numa) - slab_numa(slab, 0); - else if (show_track) - show_tracking(slab); - else if (validate) - slab_validate(slab); - else if (shrink) - slab_shrink(slab); - else if (set_debug) - slab_debug(slab); - else if (show_ops) - ops(slab); - else if (show_slab) - slabcache(slab); - else if (show_report) - report(slab); - } -} - -static void _xtotals(char *heading, char *underline, - int loss, int size, int partial) -{ - printf("%s%s", heading, underline); - line = 0; - sort_loss = loss; - sort_size = size; - sort_partial = partial; - sort_slabs(); - output_slabs(); -} - -static void xtotals(void) -{ - char *heading, *underline; - - totals(); - - link_slabs(); - rename_slabs(); - - heading = "\nSlabs sorted by size\n"; - underline = "--------------------\n"; - _xtotals(heading, underline, 0, 1, 0); - - heading = "\nSlabs sorted by loss\n"; - underline = "--------------------\n"; - _xtotals(heading, underline, 1, 0, 0); - - heading = "\nSlabs sorted by number of partial slabs\n"; - underline = "---------------------------------------\n"; - _xtotals(heading, underline, 0, 0, 1); - - printf("\n"); -} - -struct option opts[] = { - { "aliases", no_argument, NULL, 'a' }, - { "activity", no_argument, NULL, 'A' }, - { "Bytes", no_argument, NULL, 'B'}, - { "debug", optional_argument, NULL, 'd' }, - { "display-activity", no_argument, NULL, 'D' }, - { "empty", no_argument, NULL, 'e' }, - { "first-alias", no_argument, NULL, 'f' }, - { "help", no_argument, NULL, 'h' }, - { "inverted", no_argument, NULL, 'i'}, - { "slabs", no_argument, NULL, 'l' }, - { "Loss", no_argument, NULL, 'L'}, - { "numa", no_argument, NULL, 'n' }, - { "lines", required_argument, NULL, 'N'}, - { "ops", no_argument, NULL, 'o' }, - { "partial", no_argument, NULL, 'p'}, - { "report", no_argument, NULL, 'r' }, - { "shrink", no_argument, NULL, 's' }, - { "Size", no_argument, NULL, 'S'}, - { "tracking", no_argument, NULL, 't'}, - { "Totals", no_argument, NULL, 'T'}, - { "Unreclaim", no_argument, NULL, 'U'}, - { "validate", no_argument, NULL, 'v' }, - { "Xtotals", no_argument, NULL, 'X'}, - { "zero", no_argument, NULL, 'z' }, - { "1ref", no_argument, NULL, '1'}, - { NULL, 0, NULL, 0 } -}; - -int main(int argc, char *argv[]) -{ - int c; - int err; - char *pattern_source; - - page_size = getpagesize(); - - while ((c = getopt_long(argc, argv, "aABd::DefhilLnN:oPrsStTUvXz1", - opts, NULL)) != -1) - switch (c) { - case 'a': - show_alias = 1; - break; - case 'A': - sort_active = 1; - break; - case 'B': - show_bytes = 1; - break; - case 'd': - set_debug = 1; - if (!debug_opt_scan(optarg)) - fatal("Invalid debug option '%s'\n", optarg); - break; - case 'D': - show_activity = 1; - break; - case 'e': - show_empty = 1; - break; - case 'f': - show_first_alias = 1; - break; - case 'h': - usage(); - return 0; - case 'i': - show_inverted = 1; - break; - case 'l': - show_slab = 1; - break; - case 'L': - sort_loss = 1; - break; - case 'n': - show_numa = 1; - break; - case 'N': - if (optarg) { - output_lines = atoi(optarg); - if (output_lines < 1) - output_lines = 1; - } - break; - case 'o': - show_ops = 1; - break; - case 'r': - show_report = 1; - break; - case 'P': - sort_partial = 1; - break; - case 's': - shrink = 1; - break; - case 'S': - sort_size = 1; - break; - case 't': - show_track = 1; - break; - case 'T': - show_totals = 1; - break; - case 'U': - unreclaim_only = 1; - break; - case 'v': - validate = 1; - break; - case 'X': - if (output_lines == -1) - output_lines = 1; - extended_totals = 1; - show_bytes = 1; - break; - case 
'z': - skip_zero = 0; - break; - case '1': - show_single_ref = 1; - break; - default: - fatal("%s: Invalid option '%c'\n", argv[0], optopt); - - } - - if (!show_slab && !show_alias && !show_track && !show_report - && !validate && !shrink && !set_debug && !show_ops) - show_slab = 1; - - if (argc > optind) - pattern_source = argv[optind]; - else - pattern_source = ".*"; - - err = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB); - if (err) - fatal("%s: Invalid pattern '%s' code %d\n", - argv[0], pattern_source, err); - read_slab_dir(); - if (show_alias) { - alias(); - } else if (extended_totals) { - xtotals(); - } else if (show_totals) { - totals(); - } else { - link_slabs(); - rename_slabs(); - sort_slabs(); - output_slabs(); - } - return 0; -} -- cgit v1.2.3 From baa489fabd01596d5426d6e112b34ba5fb59ab82 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:53 +0000 Subject: selftests/vm: rename selftests/vm to selftests/mm Rename selftests/vm to selftests/mm to be more consistent with the code, documentation, and tools directories, and to avoid confusion with virtual machines. [sj@kernel.org: convert missing vm->mm changes] Link: https://lkml.kernel.org/r/20230107230643.252273-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230103180754.129637-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/hugetlbpage.rst | 6 +- Documentation/core-api/pin_user_pages.rst | 2 +- MAINTAINERS | 4 +- mm/Kconfig | 2 +- tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/kselftest_deps.sh | 6 +- tools/testing/selftests/mm/.gitignore | 38 + tools/testing/selftests/mm/Makefile | 180 ++ .../selftests/mm/charge_reserved_hugetlb.sh | 584 ++++++ tools/testing/selftests/mm/check_config.sh | 31 + tools/testing/selftests/mm/compaction_test.c | 231 +++ tools/testing/selftests/mm/config | 8 + tools/testing/selftests/mm/cow.c | 1764 +++++++++++++++++ tools/testing/selftests/mm/gup_test.c | 271 +++ tools/testing/selftests/mm/hmm-tests.c | 2054 ++++++++++++++++++++ tools/testing/selftests/mm/hugepage-mmap.c | 91 + tools/testing/selftests/mm/hugepage-mremap.c | 188 ++ tools/testing/selftests/mm/hugepage-shm.c | 101 + tools/testing/selftests/mm/hugepage-vmemmap.c | 144 ++ tools/testing/selftests/mm/hugetlb-madvise.c | 406 ++++ .../selftests/mm/hugetlb_reparenting_test.sh | 252 +++ tools/testing/selftests/mm/khugepaged.c | 1558 +++++++++++++++ tools/testing/selftests/mm/ksm_functional_tests.c | 279 +++ tools/testing/selftests/mm/ksm_tests.c | 849 ++++++++ tools/testing/selftests/mm/madv_populate.c | 296 +++ tools/testing/selftests/mm/map_fixed_noreplace.c | 231 +++ tools/testing/selftests/mm/map_hugetlb.c | 109 ++ tools/testing/selftests/mm/map_populate.c | 113 ++ tools/testing/selftests/mm/memfd_secret.c | 296 +++ tools/testing/selftests/mm/migration.c | 193 ++ tools/testing/selftests/mm/mlock-random-test.c | 294 +++ tools/testing/selftests/mm/mlock2-tests.c | 520 +++++ tools/testing/selftests/mm/mlock2.h | 63 + tools/testing/selftests/mm/mrelease_test.c | 206 ++ tools/testing/selftests/mm/mremap_dontunmap.c | 364 ++++ tools/testing/selftests/mm/mremap_test.c | 475 +++++ tools/testing/selftests/mm/on-fault-limit.c | 48 + tools/testing/selftests/mm/pkey-helpers.h | 226 +++ tools/testing/selftests/mm/pkey-powerpc.h | 133 ++ tools/testing/selftests/mm/pkey-x86.h | 177 ++ tools/testing/selftests/mm/protection_keys.c | 1788 +++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 274 +++
tools/testing/selftests/mm/settings | 1 + tools/testing/selftests/mm/soft-dirty.c | 210 ++ tools/testing/selftests/mm/split_huge_page_test.c | 309 +++ tools/testing/selftests/mm/test_hmm.sh | 105 + tools/testing/selftests/mm/test_vmalloc.sh | 177 ++ tools/testing/selftests/mm/thuge-gen.c | 257 +++ tools/testing/selftests/mm/transhuge-stress.c | 122 ++ tools/testing/selftests/mm/userfaultfd.c | 1858 ++++++++++++++++++ tools/testing/selftests/mm/util.h | 69 + tools/testing/selftests/mm/va_128TBswitch.c | 289 +++ tools/testing/selftests/mm/va_128TBswitch.sh | 54 + tools/testing/selftests/mm/virtual_address_range.c | 139 ++ tools/testing/selftests/mm/vm_util.c | 151 ++ tools/testing/selftests/mm/vm_util.h | 15 + tools/testing/selftests/mm/write_hugetlb_memory.sh | 23 + tools/testing/selftests/mm/write_to_hugetlbfs.c | 240 +++ tools/testing/selftests/vm/.gitignore | 38 - tools/testing/selftests/vm/Makefile | 180 -- .../selftests/vm/charge_reserved_hugetlb.sh | 584 ------ tools/testing/selftests/vm/check_config.sh | 31 - tools/testing/selftests/vm/compaction_test.c | 231 --- tools/testing/selftests/vm/config | 8 - tools/testing/selftests/vm/cow.c | 1764 ----------------- tools/testing/selftests/vm/gup_test.c | 271 --- tools/testing/selftests/vm/hmm-tests.c | 2054 -------------------- tools/testing/selftests/vm/hugepage-mmap.c | 91 - tools/testing/selftests/vm/hugepage-mremap.c | 188 -- tools/testing/selftests/vm/hugepage-shm.c | 101 - tools/testing/selftests/vm/hugepage-vmemmap.c | 144 -- tools/testing/selftests/vm/hugetlb-madvise.c | 406 ---- .../selftests/vm/hugetlb_reparenting_test.sh | 252 --- tools/testing/selftests/vm/khugepaged.c | 1558 --------------- tools/testing/selftests/vm/ksm_functional_tests.c | 279 --- tools/testing/selftests/vm/ksm_tests.c | 849 -------- tools/testing/selftests/vm/madv_populate.c | 296 --- tools/testing/selftests/vm/map_fixed_noreplace.c | 231 --- tools/testing/selftests/vm/map_hugetlb.c | 109 -- tools/testing/selftests/vm/map_populate.c | 113 -- tools/testing/selftests/vm/memfd_secret.c | 296 --- tools/testing/selftests/vm/migration.c | 193 -- tools/testing/selftests/vm/mlock-random-test.c | 294 --- tools/testing/selftests/vm/mlock2-tests.c | 520 ----- tools/testing/selftests/vm/mlock2.h | 63 - tools/testing/selftests/vm/mrelease_test.c | 206 -- tools/testing/selftests/vm/mremap_dontunmap.c | 364 ---- tools/testing/selftests/vm/mremap_test.c | 475 ----- tools/testing/selftests/vm/on-fault-limit.c | 48 - tools/testing/selftests/vm/pkey-helpers.h | 226 --- tools/testing/selftests/vm/pkey-powerpc.h | 133 -- tools/testing/selftests/vm/pkey-x86.h | 177 -- tools/testing/selftests/vm/protection_keys.c | 1788 ----------------- tools/testing/selftests/vm/run_vmtests.sh | 274 --- tools/testing/selftests/vm/settings | 1 - tools/testing/selftests/vm/soft-dirty.c | 210 -- tools/testing/selftests/vm/split_huge_page_test.c | 309 --- tools/testing/selftests/vm/test_hmm.sh | 105 - tools/testing/selftests/vm/test_vmalloc.sh | 177 -- tools/testing/selftests/vm/thuge-gen.c | 257 --- tools/testing/selftests/vm/transhuge-stress.c | 122 -- tools/testing/selftests/vm/userfaultfd.c | 1858 ------------------ tools/testing/selftests/vm/util.h | 69 - tools/testing/selftests/vm/va_128TBswitch.c | 289 --- tools/testing/selftests/vm/va_128TBswitch.sh | 54 - tools/testing/selftests/vm/virtual_address_range.c | 139 -- tools/testing/selftests/vm/vm_util.c | 151 -- tools/testing/selftests/vm/vm_util.h | 15 - tools/testing/selftests/vm/write_hugetlb_memory.sh | 23 - 
tools/testing/selftests/vm/write_to_hugetlbfs.c | 240 --- 110 files changed, 18865 insertions(+), 18865 deletions(-) create mode 100644 tools/testing/selftests/mm/.gitignore create mode 100644 tools/testing/selftests/mm/Makefile create mode 100644 tools/testing/selftests/mm/charge_reserved_hugetlb.sh create mode 100644 tools/testing/selftests/mm/check_config.sh create mode 100644 tools/testing/selftests/mm/compaction_test.c create mode 100644 tools/testing/selftests/mm/config create mode 100644 tools/testing/selftests/mm/cow.c create mode 100644 tools/testing/selftests/mm/gup_test.c create mode 100644 tools/testing/selftests/mm/hmm-tests.c create mode 100644 tools/testing/selftests/mm/hugepage-mmap.c create mode 100644 tools/testing/selftests/mm/hugepage-mremap.c create mode 100644 tools/testing/selftests/mm/hugepage-shm.c create mode 100644 tools/testing/selftests/mm/hugepage-vmemmap.c create mode 100644 tools/testing/selftests/mm/hugetlb-madvise.c create mode 100644 tools/testing/selftests/mm/hugetlb_reparenting_test.sh create mode 100644 tools/testing/selftests/mm/khugepaged.c create mode 100644 tools/testing/selftests/mm/ksm_functional_tests.c create mode 100644 tools/testing/selftests/mm/ksm_tests.c create mode 100644 tools/testing/selftests/mm/madv_populate.c create mode 100644 tools/testing/selftests/mm/map_fixed_noreplace.c create mode 100644 tools/testing/selftests/mm/map_hugetlb.c create mode 100644 tools/testing/selftests/mm/map_populate.c create mode 100644 tools/testing/selftests/mm/memfd_secret.c create mode 100644 tools/testing/selftests/mm/migration.c create mode 100644 tools/testing/selftests/mm/mlock-random-test.c create mode 100644 tools/testing/selftests/mm/mlock2-tests.c create mode 100644 tools/testing/selftests/mm/mlock2.h create mode 100644 tools/testing/selftests/mm/mrelease_test.c create mode 100644 tools/testing/selftests/mm/mremap_dontunmap.c create mode 100644 tools/testing/selftests/mm/mremap_test.c create mode 100644 tools/testing/selftests/mm/on-fault-limit.c create mode 100644 tools/testing/selftests/mm/pkey-helpers.h create mode 100644 tools/testing/selftests/mm/pkey-powerpc.h create mode 100644 tools/testing/selftests/mm/pkey-x86.h create mode 100644 tools/testing/selftests/mm/protection_keys.c create mode 100644 tools/testing/selftests/mm/run_vmtests.sh create mode 100644 tools/testing/selftests/mm/settings create mode 100644 tools/testing/selftests/mm/soft-dirty.c create mode 100644 tools/testing/selftests/mm/split_huge_page_test.c create mode 100644 tools/testing/selftests/mm/test_hmm.sh create mode 100644 tools/testing/selftests/mm/test_vmalloc.sh create mode 100644 tools/testing/selftests/mm/thuge-gen.c create mode 100644 tools/testing/selftests/mm/transhuge-stress.c create mode 100644 tools/testing/selftests/mm/userfaultfd.c create mode 100644 tools/testing/selftests/mm/util.h create mode 100644 tools/testing/selftests/mm/va_128TBswitch.c create mode 100644 tools/testing/selftests/mm/va_128TBswitch.sh create mode 100644 tools/testing/selftests/mm/virtual_address_range.c create mode 100644 tools/testing/selftests/mm/vm_util.c create mode 100644 tools/testing/selftests/mm/vm_util.h create mode 100644 tools/testing/selftests/mm/write_hugetlb_memory.sh create mode 100644 tools/testing/selftests/mm/write_to_hugetlbfs.c delete mode 100644 tools/testing/selftests/vm/.gitignore delete mode 100644 tools/testing/selftests/vm/Makefile delete mode 100644 tools/testing/selftests/vm/charge_reserved_hugetlb.sh delete mode 100644 
tools/testing/selftests/vm/check_config.sh delete mode 100644 tools/testing/selftests/vm/compaction_test.c delete mode 100644 tools/testing/selftests/vm/config delete mode 100644 tools/testing/selftests/vm/cow.c delete mode 100644 tools/testing/selftests/vm/gup_test.c delete mode 100644 tools/testing/selftests/vm/hmm-tests.c delete mode 100644 tools/testing/selftests/vm/hugepage-mmap.c delete mode 100644 tools/testing/selftests/vm/hugepage-mremap.c delete mode 100644 tools/testing/selftests/vm/hugepage-shm.c delete mode 100644 tools/testing/selftests/vm/hugepage-vmemmap.c delete mode 100644 tools/testing/selftests/vm/hugetlb-madvise.c delete mode 100644 tools/testing/selftests/vm/hugetlb_reparenting_test.sh delete mode 100644 tools/testing/selftests/vm/khugepaged.c delete mode 100644 tools/testing/selftests/vm/ksm_functional_tests.c delete mode 100644 tools/testing/selftests/vm/ksm_tests.c delete mode 100644 tools/testing/selftests/vm/madv_populate.c delete mode 100644 tools/testing/selftests/vm/map_fixed_noreplace.c delete mode 100644 tools/testing/selftests/vm/map_hugetlb.c delete mode 100644 tools/testing/selftests/vm/map_populate.c delete mode 100644 tools/testing/selftests/vm/memfd_secret.c delete mode 100644 tools/testing/selftests/vm/migration.c delete mode 100644 tools/testing/selftests/vm/mlock-random-test.c delete mode 100644 tools/testing/selftests/vm/mlock2-tests.c delete mode 100644 tools/testing/selftests/vm/mlock2.h delete mode 100644 tools/testing/selftests/vm/mrelease_test.c delete mode 100644 tools/testing/selftests/vm/mremap_dontunmap.c delete mode 100644 tools/testing/selftests/vm/mremap_test.c delete mode 100644 tools/testing/selftests/vm/on-fault-limit.c delete mode 100644 tools/testing/selftests/vm/pkey-helpers.h delete mode 100644 tools/testing/selftests/vm/pkey-powerpc.h delete mode 100644 tools/testing/selftests/vm/pkey-x86.h delete mode 100644 tools/testing/selftests/vm/protection_keys.c delete mode 100755 tools/testing/selftests/vm/run_vmtests.sh delete mode 100644 tools/testing/selftests/vm/settings delete mode 100644 tools/testing/selftests/vm/soft-dirty.c delete mode 100644 tools/testing/selftests/vm/split_huge_page_test.c delete mode 100755 tools/testing/selftests/vm/test_hmm.sh delete mode 100755 tools/testing/selftests/vm/test_vmalloc.sh delete mode 100644 tools/testing/selftests/vm/thuge-gen.c delete mode 100644 tools/testing/selftests/vm/transhuge-stress.c delete mode 100644 tools/testing/selftests/vm/userfaultfd.c delete mode 100644 tools/testing/selftests/vm/util.h delete mode 100644 tools/testing/selftests/vm/va_128TBswitch.c delete mode 100755 tools/testing/selftests/vm/va_128TBswitch.sh delete mode 100644 tools/testing/selftests/vm/virtual_address_range.c delete mode 100644 tools/testing/selftests/vm/vm_util.c delete mode 100644 tools/testing/selftests/vm/vm_util.h delete mode 100644 tools/testing/selftests/vm/write_hugetlb_memory.sh delete mode 100644 tools/testing/selftests/vm/write_to_hugetlbfs.c (limited to 'tools') diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index 19f27c0d92e0..a969a2c742b2 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -461,13 +461,13 @@ Examples .. 
_map_hugetlb: ``map_hugetlb`` - see tools/testing/selftests/vm/map_hugetlb.c + see tools/testing/selftests/mm/map_hugetlb.c ``hugepage-shm`` - see tools/testing/selftests/vm/hugepage-shm.c + see tools/testing/selftests/mm/hugepage-shm.c ``hugepage-mmap`` - see tools/testing/selftests/vm/hugepage-mmap.c + see tools/testing/selftests/mm/hugepage-mmap.c The `libhugetlbfs`_ library provides a wide range of userspace tools to help with huge page usability, environment setup, and control. diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index b18416f4500f..facafbdecb95 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -221,7 +221,7 @@ Unit testing ============ This file:: - tools/testing/selftests/vm/gup_test.c + tools/testing/selftests/mm/gup_test.c has the following new calls to exercise the new pin*() wrapper functions: diff --git a/MAINTAINERS b/MAINTAINERS index c726adfd1f0d..8ac1472bea34 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9466,7 +9466,7 @@ F: Documentation/mm/hmm.rst F: include/linux/hmm* F: lib/test_hmm* F: mm/hmm* -F: tools/testing/selftests/vm/*hmm* +F: tools/testing/selftests/mm/*hmm* HOST AP DRIVER M: Jouni Malinen @@ -13484,7 +13484,7 @@ F: include/linux/mmzone.h F: include/linux/pagewalk.h F: mm/ F: tools/mm/ -F: tools/testing/selftests/vm/ +F: tools/testing/selftests/mm/ VMALLOC M: Andrew Morton diff --git a/mm/Kconfig b/mm/Kconfig index ff7b209dec05..39df30dcabe3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1073,7 +1073,7 @@ config GUP_TEST pin_user_pages*(), or pinned via get_user_pages*(), as specified by other command line arguments. - See tools/testing/selftests/vm/gup_test.c + See tools/testing/selftests/mm/gup_test.c comment "GUP_TEST needs to have DEBUG_FS enabled" depends on !GUP_TEST && !DEBUG_FS diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 41b649452560..56a29f2de8e6 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -85,7 +85,7 @@ TARGETS += tmpfs TARGETS += tpm2 TARGETS += user TARGETS += vDSO -TARGETS += vm +TARGETS += mm TARGETS += x86 TARGETS += zram #Please keep the TARGETS list alphabetically sorted diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh index 7424a1f5babc..4bc14d9e8ff1 100755 --- a/tools/testing/selftests/kselftest_deps.sh +++ b/tools/testing/selftests/kselftest_deps.sh @@ -12,9 +12,9 @@ usage() echo -e "Usage: $0 -[p] [test_name]\n" echo -e "\tkselftest_deps.sh [-p] gcc" -echo -e "\tkselftest_deps.sh [-p] gcc vm" +echo -e "\tkselftest_deps.sh [-p] gcc mm" echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc" -echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc vm\n" +echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc mm\n" echo "- Should be run in selftests directory in the kernel repo." echo "- Checks if Kselftests can be built/cross-built on a system." echo "- Parses all test/sub-test Makefile to find library dependencies." @@ -120,7 +120,7 @@ l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \ # Level 2 # Some tests have multiple valid LDLIBS lines for individual sub-tests # that need dependency checks. 
Find them and append them to the tests -# e.g: vm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread +# e.g: mm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread # Filter out VAR_LDLIBS to discard the following: # memfd/Makefile:$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS) # Append space at the end of the list to append more tests. diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore new file mode 100644 index 000000000000..1f8c36a9fa10 --- /dev/null +++ b/tools/testing/selftests/mm/.gitignore @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: GPL-2.0-only +cow +hugepage-mmap +hugepage-mremap +hugepage-shm +hugepage-vmemmap +hugetlb-madvise +khugepaged +map_hugetlb +map_populate +thuge-gen +compaction_test +migration +mlock2-tests +mrelease_test +mremap_dontunmap +mremap_test +on-fault-limit +transhuge-stress +protection_keys +protection_keys_32 +protection_keys_64 +madv_populate +userfaultfd +mlock-intersect-test +mlock-random-test +virtual_address_range +gup_test +va_128TBswitch +map_fixed_noreplace +write_to_hugetlbfs +hmm-tests +memfd_secret +soft-dirty +split_huge_page_test +ksm_tests +local_config.h +local_config.mk diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile new file mode 100644 index 000000000000..6a4b639b2b2b --- /dev/null +++ b/tools/testing/selftests/mm/Makefile @@ -0,0 +1,180 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for mm selftests + +LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h + +include local_config.mk + +uname_M := $(shell uname -m 2>/dev/null || echo not) +MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') + +# Without this, failed build products remain, with up-to-date timestamps, +# thus tricking Make (and you!) into believing that All Is Well, in subsequent +# make invocations: +.DELETE_ON_ERROR: + +# Avoid accidental wrong builds, due to built-in rules working just a little +# bit too well--but not quite as well as required for our situation here. +# +# In other words, "make userfaultfd" is supposed to fail to build at all, +# because this Makefile only supports either "make" (all), or "make /full/path". +# However, the built-in rules, if not suppressed, will pick up CFLAGS and the +# initial LDLIBS (but not the target-specific LDLIBS, because those are only +# set for the full path target!). This causes it to get pretty far into building +# things despite using incorrect values such as an *occasionally* incomplete +# LDLIBS. 
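+#
+# For illustration (an editor's sketch, not part of the original patch):
+# with built-in rules left enabled, a bare "make userfaultfd" would compile
+# through the implicit %.c rule using only the global CFLAGS and the initial
+# LDLIBS, silently missing any target-specific LDLIBS, which this Makefile
+# attaches only to full-path targets such as $(OUTPUT)/cow further down.
+# With built-in rules suppressed, that bare invocation fails outright: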
+MAKEFLAGS += --no-builtin-rules + +CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES) +LDLIBS = -lrt -lpthread +TEST_GEN_FILES = cow +TEST_GEN_FILES += compaction_test +TEST_GEN_FILES += gup_test +TEST_GEN_FILES += hmm-tests +TEST_GEN_FILES += hugetlb-madvise +TEST_GEN_FILES += hugepage-mmap +TEST_GEN_FILES += hugepage-mremap +TEST_GEN_FILES += hugepage-shm +TEST_GEN_FILES += hugepage-vmemmap +TEST_GEN_FILES += khugepaged +TEST_GEN_PROGS = madv_populate +TEST_GEN_FILES += map_fixed_noreplace +TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += map_populate +TEST_GEN_FILES += memfd_secret +TEST_GEN_FILES += migration +TEST_GEN_FILES += mlock-random-test +TEST_GEN_FILES += mlock2-tests +TEST_GEN_FILES += mrelease_test +TEST_GEN_FILES += mremap_dontunmap +TEST_GEN_FILES += mremap_test +TEST_GEN_FILES += on-fault-limit +TEST_GEN_FILES += thuge-gen +TEST_GEN_FILES += transhuge-stress +TEST_GEN_FILES += userfaultfd +TEST_GEN_PROGS += soft-dirty +TEST_GEN_PROGS += split_huge_page_test +TEST_GEN_FILES += ksm_tests +TEST_GEN_PROGS += ksm_functional_tests + +ifeq ($(MACHINE),x86_64) +CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) +CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) +CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) + +VMTARGETS := protection_keys +BINARIES_32 := $(VMTARGETS:%=%_32) +BINARIES_64 := $(VMTARGETS:%=%_64) + +ifeq ($(CAN_BUILD_WITH_NOPIE),1) +CFLAGS += -no-pie +endif + +ifeq ($(CAN_BUILD_I386),1) +TEST_GEN_FILES += $(BINARIES_32) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +TEST_GEN_FILES += $(BINARIES_64) +endif +else + +ifneq (,$(findstring $(MACHINE),ppc64)) +TEST_GEN_FILES += protection_keys +endif + +endif + +ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64)) +TEST_GEN_FILES += va_128TBswitch +TEST_GEN_FILES += virtual_address_range +TEST_GEN_FILES += write_to_hugetlbfs +endif + +TEST_PROGS := run_vmtests.sh + +TEST_FILES := test_vmalloc.sh +TEST_FILES += test_hmm.sh +TEST_FILES += va_128TBswitch.sh + +include ../lib.mk + +$(OUTPUT)/cow: vm_util.c +$(OUTPUT)/khugepaged: vm_util.c +$(OUTPUT)/ksm_functional_tests: vm_util.c +$(OUTPUT)/madv_populate: vm_util.c +$(OUTPUT)/soft-dirty: vm_util.c +$(OUTPUT)/split_huge_page_test: vm_util.c +$(OUTPUT)/userfaultfd: vm_util.c + +ifeq ($(MACHINE),x86_64) +BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) +BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) + +define gen-target-rule-32 +$(1) $(1)_32: $(OUTPUT)/$(1)_32 +.PHONY: $(1) $(1)_32 +endef + +define gen-target-rule-64 +$(1) $(1)_64: $(OUTPUT)/$(1)_64 +.PHONY: $(1) $(1)_64 +endef + +ifeq ($(CAN_BUILD_I386),1) +$(BINARIES_32): CFLAGS += -m32 -mxsave +$(BINARIES_32): LDLIBS += -lrt -ldl -lm +$(BINARIES_32): $(OUTPUT)/%_32: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t)))) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +$(BINARIES_64): CFLAGS += -m64 -mxsave +$(BINARIES_64): LDLIBS += -lrt -ldl +$(BINARIES_64): $(OUTPUT)/%_64: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t)))) +endif + +# x86_64 users should be encouraged to install 32-bit libraries +ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) +all: warn_32bit_failure + +warn_32bit_failure: + @echo "Warning: you seem to have a broken 32-bit build" 2>&1; 
\ + echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ + echo "kernels. If you are using a Debian-like distribution," 2>&1; \ + echo "try:"; 2>&1; \ + echo ""; \ + echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ + echo ""; \ + echo "If you are using a Fedora-like distribution, try:"; \ + echo ""; \ + echo " yum install glibc-devel.*i686"; \ + exit 0; +endif +endif + +# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. +$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS) + +$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap + +$(OUTPUT)/ksm_tests: LDLIBS += -lnuma + +$(OUTPUT)/migration: LDLIBS += -lnuma + +local_config.mk local_config.h: check_config.sh + /bin/sh ./check_config.sh $(CC) + +EXTRA_CLEAN += local_config.mk local_config.h + +ifeq ($(COW_EXTRA_LIBS),) +all: warn_missing_liburing + +warn_missing_liburing: + @echo ; \ + echo "Warning: missing liburing support. Some COW tests will be skipped." ; \ + echo +endif diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh new file mode 100644 index 000000000000..a5cb4b09a46c --- /dev/null +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -0,0 +1,584 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +set -e + +if [[ $(id -u) -ne 0 ]]; then + echo "This test must be run as root. Skipping..." + exit $ksft_skip +fi + +fault_limit_file=limit_in_bytes +reservation_limit_file=rsvd.limit_in_bytes +fault_usage_file=usage_in_bytes +reservation_usage_file=rsvd.usage_in_bytes + +if [[ "$1" == "-cgroup-v2" ]]; then + cgroup2=1 + fault_limit_file=max + reservation_limit_file=rsvd.max + fault_usage_file=current + reservation_usage_file=rsvd.current +fi + +if [[ $cgroup2 ]]; then + cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory + mount -t cgroup2 none $cgroup_path + do_umount=1 + fi + echo "+hugetlb" >$cgroup_path/cgroup.subtree_control +else + cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory + mount -t cgroup memory,hugetlb $cgroup_path + do_umount=1 + fi +fi +export cgroup_path + +function cleanup() { + if [[ $cgroup2 ]]; then + echo $$ >$cgroup_path/cgroup.procs + else + echo $$ >$cgroup_path/tasks + fi + + if [[ -e /mnt/huge ]]; then + rm -rf /mnt/huge/* + umount /mnt/huge || echo error + rmdir /mnt/huge + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test1 + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test2 + fi + echo 0 >/proc/sys/vm/nr_hugepages + echo CLEANUP DONE +} + +function expect_equal() { + local expected="$1" + local actual="$2" + local error="$3" + + if [[ "$expected" != "$actual" ]]; then + echo "expected ($expected) != actual ($actual): $3" + cleanup + exit 1 + fi +} + +function get_machine_hugepage_size() { + hpz=$(grep -i hugepagesize /proc/meminfo) + kb=${hpz:14:-3} + mb=$(($kb / 1024)) + echo $mb +} + +MB=$(get_machine_hugepage_size) + +function setup_cgroup() { + local name="$1" + local cgroup_limit="$2" + local reservation_limit="$3" + + mkdir $cgroup_path/$name + + echo writing cgroup limit: "$cgroup_limit" + echo "$cgroup_limit" 
>$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file + + echo writing reservation limit: "$reservation_limit" + echo "$reservation_limit" > \ + $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file + + if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then + echo 0 >$cgroup_path/$name/cpuset.cpus + fi + if [ -e "$cgroup_path/$name/cpuset.mems" ]; then + echo 0 >$cgroup_path/$name/cpuset.mems + fi +} + +function wait_for_hugetlb_memory_to_get_depleted() { + local cgroup="$1" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + # Wait for hugetlbfs memory to get depleted. + while [ $(cat $path) != 0 ]; do + echo Waiting for hugetlb memory to get depleted. + cat $path + sleep 0.5 + done +} + +function wait_for_hugetlb_memory_to_get_reserved() { + local cgroup="$1" + local size="$2" + + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + # Wait for hugetlbfs memory to get reserved. + while [ $(cat $path) != $size ]; do + echo Waiting for hugetlb memory reservation to reach size $size. + cat $path + sleep 0.5 + done +} + +function wait_for_hugetlb_memory_to_get_written() { + local cgroup="$1" + local size="$2" + + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" + # Wait for hugetlbfs memory to get written. + while [ $(cat $path) != $size ]; do + echo Waiting for hugetlb memory to reach size $size. + cat $path + sleep 0.5 + done +} + +function write_hugetlbfs_and_get_usage() { + local cgroup="$1" + local size="$2" + local populate="$3" + local write="$4" + local path="$5" + local method="$6" + local private="$7" + local expect_failure="$8" + local reserve="$9" + + # Function return values. + reservation_failed=0 + oom_killed=0 + hugetlb_difference=0 + reserved_difference=0 + + local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file + local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file + + local hugetlb_before=$(cat $hugetlb_usage) + local reserved_before=$(cat $reserved_usage) + + echo + echo Starting: + echo hugetlb_usage="$hugetlb_before" + echo reserved_usage="$reserved_before" + echo expect_failure is "$expect_failure" + + output=$(mktemp) + set +e + if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] || + [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then + + bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ + "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output & + + local write_result=$? + local write_pid=$! + + until grep -q -i "DONE" $output; do + echo waiting for DONE signal. + if ! ps $write_pid > /dev/null + then + echo "FAIL: The write died" + cleanup + exit 1 + fi + sleep 0.5 + done + + echo ================= write_hugetlb_memory.sh output is: + cat $output + echo ================= end output. + + if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then + wait_for_hugetlb_memory_to_get_written "$cgroup" "$size" + elif [[ "$reserve" != "-n" ]]; then + wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" + else + # This case doesn't produce visible effects, but we still have + # to wait for the async process to start and execute... + sleep 0.5 + fi + + echo write_result is $write_result + else + bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ + "$cgroup" "$path" "$method" "$private" "$reserve" + local write_result=$?
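+ # In this synchronous branch the writer has already exited, so
+ # $write_result above is its real exit status; the cgroup's
+ # reservation counter is still polled below before the usage
+ # deltas are computed.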
+ + if [[ "$reserve" != "-n" ]]; then + wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" + fi + fi + set -e + + if [[ "$write_result" == 1 ]]; then + reservation_failed=1 + fi + + # On linus/master, the above process gets SIGBUS'd on oomkill, with + # return code 135. On earlier kernels, it gets actual oomkill, with return + # code 137, so just check for both conditions in case we're testing + # against an earlier kernel. + if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then + oom_killed=1 + fi + + local hugetlb_after=$(cat $hugetlb_usage) + local reserved_after=$(cat $reserved_usage) + + echo After write: + echo hugetlb_usage="$hugetlb_after" + echo reserved_usage="$reserved_after" + + hugetlb_difference=$(($hugetlb_after - $hugetlb_before)) + reserved_difference=$(($reserved_after - $reserved_before)) +} + +function cleanup_hugetlb_memory() { + set +e + local cgroup="$1" + if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then + echo killing write_to_hugetlbfs + killall -2 write_to_hugetlbfs + wait_for_hugetlb_memory_to_get_depleted $cgroup + fi + set -e + + if [[ -e /mnt/huge ]]; then + rm -rf /mnt/huge/* + umount /mnt/huge + rmdir /mnt/huge + fi +} + +function run_test() { + local size=$(($1 * ${MB} * 1024 * 1024)) + local populate="$2" + local write="$3" + local cgroup_limit=$(($4 * ${MB} * 1024 * 1024)) + local reservation_limit=$(($5 * ${MB} * 1024 * 1024)) + local nr_hugepages="$6" + local method="$7" + local private="$8" + local expect_failure="$9" + local reserve="${10}" + + # Function return values. + hugetlb_difference=0 + reserved_difference=0 + reservation_failed=0 + oom_killed=0 + + echo nr hugepages = "$nr_hugepages" + echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages + + setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit" + + mkdir -p /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \ + "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \ + "$reserve" + + cleanup_hugetlb_memory "hugetlb_cgroup_test" + + local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file) + local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file) + + echo $hugetlb_difference + echo $reserved_difference + expect_equal "0" "$final_hugetlb" "final hugetlb is not zero" + expect_equal "0" "$final_reservation" "final reservation is not zero" +} + +function run_multiple_cgroup_test() { + local size1="$1" + local populate1="$2" + local write1="$3" + local cgroup_limit1="$4" + local reservation_limit1="$5" + + local size2="$6" + local populate2="$7" + local write2="$8" + local cgroup_limit2="$9" + local reservation_limit2="${10}" + + local nr_hugepages="${11}" + local method="${12}" + local private="${13}" + local expect_failure="${14}" + local reserve="${15}" + + # Function return values. 
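+ # Two sets of result globals, one per cgroup: shell functions cannot
+ # return structured values, so every result travels through these.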
+ hugetlb_difference1=0 + reserved_difference1=0 + reservation_failed1=0 + oom_killed1=0 + + hugetlb_difference2=0 + reserved_difference2=0 + reservation_failed2=0 + oom_killed2=0 + + echo nr hugepages = "$nr_hugepages" + echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages + + setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1" + setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2" + + mkdir -p /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \ + "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \ + "$expect_failure" "$reserve" + + hugetlb_difference1=$hugetlb_difference + reserved_difference1=$reserved_difference + reservation_failed1=$reservation_failed + oom_killed1=$oom_killed + + local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file + local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file + local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file + local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file + + local usage_before_second_write=$(cat $cgroup1_hugetlb_usage) + local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage) + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \ + "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \ + "$expect_failure" "$reserve" + + hugetlb_difference2=$hugetlb_difference + reserved_difference2=$reserved_difference + reservation_failed2=$reservation_failed + oom_killed2=$oom_killed + + expect_equal "$usage_before_second_write" \ + "$(cat $cgroup1_hugetlb_usage)" "Usage changed." + expect_equal "$reservation_usage_before_second_write" \ + "$(cat $cgroup1_reservation_usage)" "Reservation usage changed." + + cleanup_hugetlb_memory + + local final_hugetlb=$(cat $cgroup1_hugetlb_usage) + local final_reservation=$(cat $cgroup1_reservation_usage) + + expect_equal "0" "$final_hugetlb" \ + "hugetlb_cgroup_test1 final hugetlb is not zero" + expect_equal "0" "$final_reservation" \ + "hugetlb_cgroup_test1 final reservation is not zero" + + local final_hugetlb=$(cat $cgroup2_hugetlb_usage) + local final_reservation=$(cat $cgroup2_reservation_usage) + + expect_equal "0" "$final_hugetlb" \ + "hugetlb_cgroup_test2 final hugetlb is not zero" + expect_equal "0" "$final_reservation" \ + "hugetlb_cgroup_test2 final reservation is not zero" +} + +cleanup + +for populate in "" "-o"; do + for method in 0 1 2; do + for private in "" "-r"; do + for reserve in "" "-n"; do + + # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported. + if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then + continue + fi + + # Skip populated shmem tests. Doesn't seem to be supported. + if [[ "$method" == 2 ]] && [[ "$populate" == "-o" ]]; then + continue + fi + + if [[ "$method" == 2 ]] && [[ "$reserve" == "-n" ]]; then + continue + fi + + cleanup + echo + echo + echo + echo Test normal case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve" + + echo Memory charged to hugetlb=$hugetlb_difference + echo Memory charged to reservation=$reserved_difference + + if [[ "$populate" == "-o" ]]; then + expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup."
+ else + expect_equal "0" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup." + fi + + if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then + expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + else + expect_equal "0" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + fi + + echo 'PASS' + + cleanup + echo + echo + echo + echo Test normal case with write. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve" + + echo Memory charged to hugetlb=$hugetlb_difference + echo Memory charged to reservation=$reserved_difference + + expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup." + + expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + + echo 'PASS' + + cleanup + + echo + echo + echo + echo Test more than reservation case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + + if [ "$reserve" != "-n" ]; then + run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \ + "$reserve" + + expect_equal "1" "$reservation_failed" "Reservation succeeded." + fi + + echo 'PASS' + + cleanup + + echo + echo + echo + echo Test more than cgroup limit case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + + # Not sure if shm memory can be cleaned up when the process gets sigbus'd. + if [[ "$method" != 2 ]]; then + run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve" + + expect_equal "1" "$oom_killed" "Not oom killed." + fi + echo 'PASS' + + cleanup + + echo + echo + echo + echo Test normal case, multiple cgroups. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \ + "$populate" "" "10" "10" "10" \ + "$method" "$private" "0" "$reserve" + + echo Memory charged to hugetlb1=$hugetlb_difference1 + echo Memory charged to reservation1=$reserved_difference1 + echo Memory charged to hugetlb2=$hugetlb_difference2 + echo Memory charged to reservation2=$reserved_difference2 + + if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then + expect_equal "3" "$reserved_difference1" \ + "Incorrect reservations charged to cgroup 1." + + expect_equal "5" "$reserved_difference2" \ + "Incorrect reservation charged to cgroup 2." + + else + expect_equal "0" "$reserved_difference1" \ + "Incorrect reservations charged to cgroup 1." + + expect_equal "0" "$reserved_difference2" \ + "Incorrect reservation charged to cgroup 2." + fi + + if [[ "$populate" == "-o" ]]; then + expect_equal "3" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "5" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + + else + expect_equal "0" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "0" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + fi + echo 'PASS' + + cleanup + echo + echo + echo + echo Test normal case with write, multiple cgroups.
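+ # With -w each cgroup faults in its whole mapping, so both its fault
+ # usage and its reservation usage must grow by exactly its own share;
+ # the expect_equal checks below assert the 3/5 split between cgroups.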
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \ + "$populate" "-w" "10" "10" "10" \ + "$method" "$private" "0" "$reserve" + + echo Memory charged to hugetlb1=$hugetlb_difference1 + echo Memory charged to reservation1=$reserved_difference1 + echo Memory charged to hugetlb2=$hugetlb_difference2 + echo Memory charged to reservation2=$reserved_difference2 + + expect_equal "3" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "3" "$reserved_difference1" \ + "Incorrect reservation charged to cgroup 1." + + expect_equal "5" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + + expect_equal "5" "$reserved_difference2" \ + "Incorrect reservation charged to cgroup 2." + echo 'PASS' + + cleanup + + done # reserve + done # private + done # populate +done # method + +if [[ $do_umount ]]; then + umount $cgroup_path + rmdir $cgroup_path +fi diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh new file mode 100644 index 000000000000..bcba3af0acea --- /dev/null +++ b/tools/testing/selftests/mm/check_config.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Probe for libraries and create header files to record the results. Both C +# header files and Makefile include fragments are created. + +OUTPUT_H_FILE=local_config.h +OUTPUT_MKFILE=local_config.mk + +tmpname=$(mktemp) +tmpfile_c=${tmpname}.c +tmpfile_o=${tmpname}.o + +# liburing +echo "#include <sys/types.h>" > $tmpfile_c +echo "#include <liburing.h>" >> $tmpfile_c +echo "int func(void) { return 0; }" >> $tmpfile_c + +CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"} +$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 + +if [ -f $tmpfile_o ]; then + echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE + echo "COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE +else + echo "// No liburing support found" > $OUTPUT_H_FILE + echo "# No liburing support found, so:" > $OUTPUT_MKFILE + echo "COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE +fi + +rm ${tmpname}.* diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c new file mode 100644 index 000000000000..9b420140ba2b --- /dev/null +++ b/tools/testing/selftests/mm/compaction_test.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * A test for the patch "Allow compaction of unevictable pages". + * With this patch we should be able to allocate at least 1/4 + * of RAM in huge pages. Without the patch much less is + * allocated.
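+ *
+ * Method, as implemented in main() below: mmap() and mlock() chunks
+ * covering roughly 80% of free memory, dirtying every page so KSM cannot
+ * merge them, then raise /proc/sys/vm/nr_hugepages and check how many
+ * huge pages the kernel managed to assemble out of the fragmented memory.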
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> + +#include "../kselftest.h" + +#define MAP_SIZE_MB 100 +#define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024) + +struct map_list { + void *map; + struct map_list *next; +}; + +int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize) +{ + char buffer[256] = {0}; + char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'"; + FILE *cmdfile = popen(cmd, "r"); + + if (!(fgets(buffer, sizeof(buffer), cmdfile))) { + perror("Failed to read meminfo\n"); + return -1; + } + + pclose(cmdfile); + + *memfree = atoll(buffer); + cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'"; + cmdfile = popen(cmd, "r"); + + if (!(fgets(buffer, sizeof(buffer), cmdfile))) { + perror("Failed to read meminfo\n"); + return -1; + } + + pclose(cmdfile); + *hugepagesize = atoll(buffer); + + return 0; +} + +int prereq(void) +{ + char allowed; + int fd; + + fd = open("/proc/sys/vm/compact_unevictable_allowed", + O_RDONLY | O_NONBLOCK); + if (fd < 0) { + perror("Failed to open\n" + "/proc/sys/vm/compact_unevictable_allowed\n"); + return -1; + } + + if (read(fd, &allowed, sizeof(char)) != sizeof(char)) { + perror("Failed to read from\n" + "/proc/sys/vm/compact_unevictable_allowed\n"); + close(fd); + return -1; + } + + close(fd); + if (allowed == '1') + return 0; + + return -1; +} + +int check_compaction(unsigned long mem_free, unsigned int hugepage_size) +{ + int fd; + int compaction_index = 0; + char initial_nr_hugepages[10] = {0}; + char nr_hugepages[10] = {0}; + + /* We want to test with 80% of available memory. Else, OOM killer comes + into play */ + mem_free = mem_free * 0.8; + + fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); + if (fd < 0) { + perror("Failed to open /proc/sys/vm/nr_hugepages"); + return -1; + } + + if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) { + perror("Failed to read from /proc/sys/vm/nr_hugepages"); + goto close_fd; + } + + /* Start with the initial condition of 0 huge pages */ + if (write(fd, "0", sizeof(char)) != sizeof(char)) { + perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + lseek(fd, 0, SEEK_SET); + + /* Request a large number of huge pages. The kernel will allocate + as much as it can */ + if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { + perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + lseek(fd, 0, SEEK_SET); + + if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { + perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + /* We should have been able to request at least 1/3rd of the memory in + huge pages */ + compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size); + + if (compaction_index > 3) { + printf("No of huge pages allocated = %d\n", + (atoi(nr_hugepages))); + fprintf(stderr, "ERROR: Less than 1/%d of memory is available\n" + "as huge pages\n", compaction_index); + goto close_fd; + } + + printf("No of huge pages allocated = %d\n", + (atoi(nr_hugepages))); + + lseek(fd, 0, SEEK_SET); + + if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages)) + != strlen(initial_nr_hugepages)) { + perror("Failed to write value to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + close(fd); + return 0; + + close_fd: + close(fd); + printf("Not OK. 
Compaction test failed.\n"); + return -1; +} + + +int main(int argc, char **argv) +{ + struct rlimit lim; + struct map_list *list, *entry; + size_t page_size, i; + void *map = NULL; + unsigned long mem_free = 0; + unsigned long hugepage_size = 0; + long mem_fragmentable_MB = 0; + + if (prereq() != 0) { + printf("Either the sysctl compact_unevictable_allowed is not\n" + "set to 1 or couldn't read the proc file.\n" + "Skipping the test\n"); + return KSFT_SKIP; + } + + lim.rlim_cur = RLIM_INFINITY; + lim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_MEMLOCK, &lim)) { + perror("Failed to set rlimit:\n"); + return -1; + } + + page_size = getpagesize(); + + list = NULL; + + if (read_memory_info(&mem_free, &hugepage_size) != 0) { + printf("ERROR: Cannot read meminfo\n"); + return -1; + } + + mem_fragmentable_MB = mem_free * 0.8 / 1024; + + while (mem_fragmentable_MB > 0) { + map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0); + if (map == MAP_FAILED) + break; + + entry = malloc(sizeof(struct map_list)); + if (!entry) { + munmap(map, MAP_SIZE); + break; + } + entry->map = map; + entry->next = list; + list = entry; + + /* Write something (in this case the address of the map) to + * ensure that KSM can't merge the mapped pages + */ + for (i = 0; i < MAP_SIZE; i += page_size) + *(unsigned long *)(map + i) = (unsigned long)map + i; + + mem_fragmentable_MB -= MAP_SIZE_MB; + } + + /* Unmap every entry in the list. */ + for (entry = list; entry != NULL; entry = entry->next) + munmap(entry->map, MAP_SIZE); + + if (check_compaction(mem_free, hugepage_size) == 0) + return 0; + + return -1; +} diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config new file mode 100644 index 000000000000..be087c4bc396 --- /dev/null +++ b/tools/testing/selftests/mm/config @@ -0,0 +1,8 @@ +CONFIG_SYSVIPC=y +CONFIG_USERFAULTFD=y +CONFIG_TEST_VMALLOC=m +CONFIG_DEVICE_PRIVATE=y +CONFIG_TEST_HMM=m +CONFIG_GUP_TEST=y +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_MEM_SOFT_DIRTY=y diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c new file mode 100644 index 000000000000..16216d893d96 --- /dev/null +++ b/tools/testing/selftests/mm/cow.c @@ -0,0 +1,1764 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * COW (Copy On Write) tests. + * + * Copyright 2022, Red Hat, Inc. 
+ * + * Author(s): David Hildenbrand + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "local_config.h" +#ifdef LOCAL_CONFIG_HAVE_LIBURING +#include +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + +#include "../../../../mm/gup_test.h" +#include "../kselftest.h" +#include "vm_util.h" + +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif + +static size_t pagesize; +static int pagemap_fd; +static size_t thpsize; +static int nr_hugetlbsizes; +static size_t hugetlbsizes[10]; +static int gup_fd; +static bool has_huge_zeropage; + +static void detect_thpsize(void) +{ + int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", + O_RDONLY); + size_t size = 0; + char buf[15]; + int ret; + + if (fd < 0) + return; + + ret = pread(fd, buf, sizeof(buf), 0); + if (ret > 0 && ret < sizeof(buf)) { + buf[ret] = 0; + + size = strtoul(buf, NULL, 10); + if (size < pagesize) + size = 0; + if (size > 0) { + thpsize = size; + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", + thpsize / 1024); + } + } + + close(fd); +} + +static void detect_huge_zeropage(void) +{ + int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", + O_RDONLY); + size_t enabled = 0; + char buf[15]; + int ret; + + if (fd < 0) + return; + + ret = pread(fd, buf, sizeof(buf), 0); + if (ret > 0 && ret < sizeof(buf)) { + buf[ret] = 0; + + enabled = strtoul(buf, NULL, 10); + if (enabled == 1) { + has_huge_zeropage = true; + ksft_print_msg("[INFO] huge zeropage is enabled\n"); + } + } + + close(fd); +} + +static void detect_hugetlbsizes(void) +{ + DIR *dir = opendir("/sys/kernel/mm/hugepages/"); + + if (!dir) + return; + + while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) { + struct dirent *entry = readdir(dir); + size_t kb; + + if (!entry) + break; + if (entry->d_type != DT_DIR) + continue; + if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1) + continue; + hugetlbsizes[nr_hugetlbsizes] = kb * 1024; + nr_hugetlbsizes++; + ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n", + kb); + } + closedir(dir); +} + +static bool range_is_swapped(void *addr, size_t size) +{ + for (; size; addr += pagesize, size -= pagesize) + if (!pagemap_is_swapped(pagemap_fd, addr)) + return false; + return true; +} + +struct comm_pipes { + int child_ready[2]; + int parent_ready[2]; +}; + +static int setup_comm_pipes(struct comm_pipes *comm_pipes) +{ + if (pipe(comm_pipes->child_ready) < 0) + return -errno; + if (pipe(comm_pipes->parent_ready) < 0) { + close(comm_pipes->child_ready[0]); + close(comm_pipes->child_ready[1]); + return -errno; + } + + return 0; +} + +static void close_comm_pipes(struct comm_pipes *comm_pipes) +{ + close(comm_pipes->child_ready[0]); + close(comm_pipes->child_ready[1]); + close(comm_pipes->parent_ready[0]); + close(comm_pipes->parent_ready[1]); +} + +static int child_memcmp_fn(char *mem, size_t size, + struct comm_pipes *comm_pipes) +{ + char *old = malloc(size); + char buf; + + /* Backup the original content. */ + memcpy(old, mem, size); + + /* Wait until the parent modified the page. */ + write(comm_pipes->child_ready[1], "0", 1); + while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) + ; + + /* See if we still read the old values. 
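A nonzero memcmp() below means the parent's write leaked into this child, i.e. the COW copy never happened.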
*/ + return memcmp(old, mem, size); +} + +static int child_vmsplice_memcmp_fn(char *mem, size_t size, + struct comm_pipes *comm_pipes) +{ + struct iovec iov = { + .iov_base = mem, + .iov_len = size, + }; + ssize_t cur, total, transferred; + char *old, *new; + int fds[2]; + char buf; + + old = malloc(size); + new = malloc(size); + + /* Backup the original content. */ + memcpy(old, mem, size); + + if (pipe(fds) < 0) + return -errno; + + /* Trigger a read-only pin. */ + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred < 0) + return -errno; + if (transferred == 0) + return -EINVAL; + + /* Unmap it from our page tables. */ + if (munmap(mem, size) < 0) + return -errno; + + /* Wait until the parent modified it. */ + write(comm_pipes->child_ready[1], "0", 1); + while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) + ; + + /* See if we still read the old values via the pipe. */ + for (total = 0; total < transferred; total += cur) { + cur = read(fds[0], new + total, transferred - total); + if (cur < 0) + return -errno; + } + + return memcmp(old, new, transferred); +} + +typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); + +static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, + child_fn fn) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + exit(fn(mem, size, &comm_pipes)); + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + if (do_mprotect) { + /* + * mprotect() optimizations might try avoiding + * write-faults by directly mapping pages writable. + */ + ret = mprotect(mem, size, PROT_READ); + ret |= mprotect(mem, size, PROT_READ|PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + } + + /* Modify the page. 
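+	 * The memset() must trigger a write fault that copies (or unshares)
+	 * the page; the child still compares against the old content.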
*/ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + ksft_test_result(!ret, "No leak from parent into child\n"); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_cow_in_parent(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, false, child_memcmp_fn); +} + +static void test_cow_in_parent_mprotect(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, true, child_memcmp_fn); +} + +static void test_vmsplice_in_child(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); +} + +static void test_vmsplice_in_child_mprotect(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); +} + +static void do_test_vmsplice_in_parent(char *mem, size_t size, + bool before_fork) +{ + struct iovec iov = { + .iov_base = mem, + .iov_len = size, + }; + ssize_t cur, total, transferred; + struct comm_pipes comm_pipes; + char *old, *new; + int ret, fds[2]; + char buf; + + old = malloc(size); + new = malloc(size); + + memcpy(old, mem, size); + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + goto free; + } + + if (pipe(fds) < 0) { + ksft_test_result_fail("pipe() failed\n"); + goto close_comm_pipes; + } + + if (before_fork) { + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred <= 0) { + ksft_test_result_fail("vmsplice() failed\n"); + goto close_pipe; + } + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_pipe; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + /* Modify page content in the child. */ + memset(mem, 0xff, size); + exit(0); + } + + if (!before_fork) { + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred <= 0) { + ksft_test_result_fail("vmsplice() failed\n"); + wait(&ret); + goto close_pipe; + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + if (munmap(mem, size) < 0) { + ksft_test_result_fail("munmap() failed\n"); + goto close_pipe; + } + write(comm_pipes.parent_ready[1], "0", 1); + + /* Wait until the child is done writing. */ + wait(&ret); + if (!WIFEXITED(ret)) { + ksft_test_result_fail("wait() failed\n"); + goto close_pipe; + } + + /* See if we still read the old values. 
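+	 * The pipe drains the pages as they were pinned by vmsplice(); if
+	 * COW was handled correctly, the child's memset() is not visible
+	 * through them.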
*/ + for (total = 0; total < transferred; total += cur) { + cur = read(fds[0], new + total, transferred - total); + if (cur < 0) { + ksft_test_result_fail("read() failed\n"); + goto close_pipe; + } + } + + ksft_test_result(!memcmp(old, new, transferred), + "No leak from child into parent\n"); +close_pipe: + close(fds[0]); + close(fds[1]); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +free: + free(old); + free(new); +} + +static void test_vmsplice_before_fork(char *mem, size_t size) +{ + do_test_vmsplice_in_parent(mem, size, true); +} + +static void test_vmsplice_after_fork(char *mem, size_t size) +{ + do_test_vmsplice_in_parent(mem, size, false); +} + +#ifdef LOCAL_CONFIG_HAVE_LIBURING +static void do_test_iouring(char *mem, size_t size, bool use_fork) +{ + struct comm_pipes comm_pipes; + struct io_uring_cqe *cqe; + struct io_uring_sqe *sqe; + struct io_uring ring; + ssize_t cur, total; + struct iovec iov; + char *buf, *tmp; + int ret, fd; + FILE *file; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + file = tmpfile(); + if (!file) { + ksft_test_result_fail("tmpfile() failed\n"); + goto close_comm_pipes; + } + fd = fileno(file); + assert(fd); + + tmp = malloc(size); + if (!tmp) { + ksft_test_result_fail("malloc() failed\n"); + goto close_file; + } + + /* Skip on errors, as we might just lack kernel support. */ + ret = io_uring_queue_init(1, &ring, 0); + if (ret < 0) { + ksft_test_result_skip("io_uring_queue_init() failed\n"); + goto free_tmp; + } + + /* + * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN + * | FOLL_LONGTERM the range. + * + * Skip on errors, as we might just lack kernel support or might not + * have sufficient MEMLOCK permissions. + */ + iov.iov_base = mem; + iov.iov_len = size; + ret = io_uring_register_buffers(&ring, &iov, 1); + if (ret) { + ksft_test_result_skip("io_uring_register_buffers() failed\n"); + goto queue_exit; + } + + if (use_fork) { + /* + * fork() and keep the child alive until we're done. Note that + * we expect the pinned page to not get shared with the child. + */ + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto unregister_buffers; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + exit(0); + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + } else { + /* + * Map the page R/O into the page table. Enable softdirty + * tracking to stop the page from getting mapped R/W immediately + * again by mprotect() optimizations. Note that we don't have an + * easy way to test if that worked (the pagemap does not export + * if the page is mapped R/O vs. R/W). + */ + ret = mprotect(mem, size, PROT_READ); + clear_softdirty(); + ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto unregister_buffers; + } + } + + /* + * Modify the page and write page content as observed by the fixed + * buffer pin to the file so we can verify it. 
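+	 * If the longterm pin is reliable, the fixed-buffer write goes
+	 * through the very page we just modified, so the file ends up with
+	 * the new (0xff) content.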
+ */ + memset(mem, 0xff, size); + sqe = io_uring_get_sqe(&ring); + if (!sqe) { + ksft_test_result_fail("io_uring_get_sqe() failed\n"); + goto quit_child; + } + io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0); + + ret = io_uring_submit(&ring); + if (ret < 0) { + ksft_test_result_fail("io_uring_submit() failed\n"); + goto quit_child; + } + + ret = io_uring_wait_cqe(&ring, &cqe); + if (ret < 0) { + ksft_test_result_fail("io_uring_wait_cqe() failed\n"); + goto quit_child; + } + + if (cqe->res != size) { + ksft_test_result_fail("write_fixed failed\n"); + goto quit_child; + } + io_uring_cqe_seen(&ring, cqe); + + /* Read back the file content to the temporary buffer. */ + total = 0; + while (total < size) { + cur = pread(fd, tmp + total, size - total, total); + if (cur < 0) { + ksft_test_result_fail("pread() failed\n"); + goto quit_child; + } + total += cur; + } + + /* Finally, check if we read what we expected. */ + ksft_test_result(!memcmp(mem, tmp, size), + "Longterm R/W pin is reliable\n"); + +quit_child: + if (use_fork) { + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + } +unregister_buffers: + io_uring_unregister_buffers(&ring); +queue_exit: + io_uring_queue_exit(&ring); +free_tmp: + free(tmp); +close_file: + fclose(file); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_iouring_ro(char *mem, size_t size) +{ + do_test_iouring(mem, size, false); +} + +static void test_iouring_fork(char *mem, size_t size) +{ + do_test_iouring(mem, size, true); +} + +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + +enum ro_pin_test { + RO_PIN_TEST, + RO_PIN_TEST_SHARED, + RO_PIN_TEST_PREVIOUSLY_SHARED, + RO_PIN_TEST_RO_EXCLUSIVE, +}; + +static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, + bool fast) +{ + struct pin_longterm_test args; + struct comm_pipes comm_pipes; + char *tmp, buf; + __u64 tmp_val; + int ret; + + if (gup_fd < 0) { + ksft_test_result_skip("gup_test not available\n"); + return; + } + + tmp = malloc(size); + if (!tmp) { + ksft_test_result_fail("malloc() failed\n"); + return; + } + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + goto free_tmp; + } + + switch (test) { + case RO_PIN_TEST: + break; + case RO_PIN_TEST_SHARED: + case RO_PIN_TEST_PREVIOUSLY_SHARED: + /* + * Share the pages with our child. As the pages are not pinned, + * this should just work. + */ + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + exit(0); + } + + /* Wait until our child is ready. */ + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) { + /* + * Tell the child to quit now and wait until it quit. + * The pages should now be mapped R/O into our page + * tables, but they are no longer shared. + */ + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + if (!WIFEXITED(ret)) + ksft_print_msg("[INFO] wait() failed\n"); + } + break; + case RO_PIN_TEST_RO_EXCLUSIVE: + /* + * Map the page R/O into the page table. Enable softdirty + * tracking to stop the page from getting mapped R/W immediately + * again by mprotect() optimizations. Note that we don't have an + * easy way to test if that worked (the pagemap does not export + * if the page is mapped R/O vs. R/W). 
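+		 * After this, the page is anonymous-exclusive but mapped R/O,
+		 * which is the state the R/O pin below has to handle without
+		 * losing the connection to the page.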
+ */ + ret = mprotect(mem, size, PROT_READ); + clear_softdirty(); + ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Take a R/O pin. This should trigger unsharing. */ + args.addr = (__u64)(uintptr_t)mem; + args.size = size; + args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); + if (ret) { + if (errno == EINVAL) + ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n"); + else + ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n"); + goto wait; + } + + /* Modify the page. */ + memset(mem, 0xff, size); + + /* + * Read back the content via the pin to the temporary buffer and + * test if we observed the modification. + */ + tmp_val = (__u64)(uintptr_t)tmp; + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val); + if (ret) + ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n"); + else + ksft_test_result(!memcmp(mem, tmp, size), + "Longterm R/O pin is reliable\n"); + + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP); + if (ret) + ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n"); +wait: + switch (test) { + case RO_PIN_TEST_SHARED: + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + if (!WIFEXITED(ret)) + ksft_print_msg("[INFO] wait() failed\n"); + break; + default: + break; + } +close_comm_pipes: + close_comm_pipes(&comm_pipes); +free_tmp: + free(tmp); +} + +static void test_ro_pin_on_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); +} + +static void test_ro_fast_pin_on_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); +} + +static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); +} + +static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); +} + +static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); +} + +static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); +} + +typedef void (*test_fn)(char *mem, size_t size); + +static void do_run_with_base_page(test_fn fn, bool swapout) +{ + char *mem; + int ret; + + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + ret = madvise(mem, pagesize, MADV_NOHUGEPAGE); + /* Ignore if not around on a kernel. */ + if (ret && errno != EINVAL) { + ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + goto munmap; + } + + /* Populate a base page. */ + memset(mem, 0, pagesize); + + if (swapout) { + madvise(mem, pagesize, MADV_PAGEOUT); + if (!pagemap_is_swapped(pagemap_fd, mem)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto munmap; + } + } + + fn(mem, pagesize); +munmap: + munmap(mem, pagesize); +} + +static void run_with_base_page(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with base page\n", desc); + do_run_with_base_page(fn, false); +} + +static void run_with_base_page_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... 
with swapped-out base page\n", desc);
+	do_run_with_base_page(fn, true);
+}
+
+enum thp_run {
+	THP_RUN_PMD,
+	THP_RUN_PMD_SWAPOUT,
+	THP_RUN_PTE,
+	THP_RUN_PTE_SWAPOUT,
+	THP_RUN_SINGLE_PTE,
+	THP_RUN_SINGLE_PTE_SWAPOUT,
+	THP_RUN_PARTIAL_MREMAP,
+	THP_RUN_PARTIAL_SHARED,
+};
+
+static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
+{
+	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
+	size_t size, mmap_size, mremap_size;
+	int ret;
+
+	/* For alignment purposes, we need twice the thp size. */
+	mmap_size = 2 * thpsize;
+	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (mmap_mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		return;
+	}
+
+	/* We need a THP-aligned memory area. */
+	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+
+	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
+	if (ret) {
+		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
+		goto munmap;
+	}
+
+	/*
+	 * Try to populate a THP. Touch the first sub-page and test if we get
+	 * another sub-page populated automatically.
+	 */
+	mem[0] = 0;
+	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
+		ksft_test_result_skip("Did not get a THP populated\n");
+		goto munmap;
+	}
+	memset(mem, 0, thpsize);
+
+	size = thpsize;
+	switch (thp_run) {
+	case THP_RUN_PMD:
+	case THP_RUN_PMD_SWAPOUT:
+		break;
+	case THP_RUN_PTE:
+	case THP_RUN_PTE_SWAPOUT:
+		/*
+		 * Trigger PTE-mapping the THP by temporarily mapping a single
+		 * subpage R/O.
+		 */
+		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
+		if (ret) {
+			ksft_test_result_fail("mprotect() failed\n");
+			goto munmap;
+		}
+		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
+		if (ret) {
+			ksft_test_result_fail("mprotect() failed\n");
+			goto munmap;
+		}
+		break;
+	case THP_RUN_SINGLE_PTE:
+	case THP_RUN_SINGLE_PTE_SWAPOUT:
+		/*
+		 * Discard all but a single subpage of that PTE-mapped THP. What
+		 * remains is a single PTE mapping a single subpage.
+		 */
+		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
+		if (ret) {
+			ksft_test_result_fail("MADV_DONTNEED failed\n");
+			goto munmap;
+		}
+		size = pagesize;
+		break;
+	case THP_RUN_PARTIAL_MREMAP:
+		/*
+		 * Remap half of the THP. We need some new memory location
+		 * for that.
+		 */
+		mremap_size = thpsize / 2;
+		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
+				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (mremap_mem == MAP_FAILED) {
+			ksft_test_result_fail("mmap() failed\n");
+			goto munmap;
+		}
+		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
+			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
+		if (tmp != mremap_mem) {
+			ksft_test_result_fail("mremap() failed\n");
+			goto munmap;
+		}
+		size = mremap_size;
+		break;
+	case THP_RUN_PARTIAL_SHARED:
+		/*
+		 * Share the first page of the THP with a child and quit the
+		 * child. This will result in some parts of the THP never
+		 * having been shared.
+		 */
+		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
+		if (ret) {
+			ksft_test_result_fail("MADV_DONTFORK failed\n");
+			goto munmap;
+		}
+		ret = fork();
+		if (ret < 0) {
+			ksft_test_result_fail("fork() failed\n");
+			goto munmap;
+		} else if (!ret) {
+			exit(0);
+		}
+		wait(&ret);
+		/* Allow for sharing all pages again.
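+		 * (Without MADV_DOFORK, later fork()s in fn() would skip the
+		 * range and we would not test what we intend to test.)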
*/ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + goto munmap; + } + break; + default: + assert(false); + } + + switch (thp_run) { + case THP_RUN_PMD_SWAPOUT: + case THP_RUN_PTE_SWAPOUT: + case THP_RUN_SINGLE_PTE_SWAPOUT: + madvise(mem, size, MADV_PAGEOUT); + if (!range_is_swapped(mem, size)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto munmap; + } + break; + default: + break; + } + + fn(mem, size); +munmap: + munmap(mmap_mem, mmap_size); + if (mremap_mem != MAP_FAILED) + munmap(mremap_mem, mremap_size); +} + +static void run_with_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with THP\n", desc); + do_run_with_thp(fn, THP_RUN_PMD); +} + +static void run_with_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc); + do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT); +} + +static void run_with_pte_mapped_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc); + do_run_with_thp(fn, THP_RUN_PTE); +} + +static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc); + do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT); +} + +static void run_with_single_pte_of_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE); +} + +static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT); +} + +static void run_with_partial_mremap_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc); + do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP); +} + +static void run_with_partial_shared_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc); + do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED); +} + +static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) +{ + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; + char *mem, *dummy; + + ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc, + hugetlbsize / 1024); + + flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT; + + mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + return; + } + + /* Populate an huge page. */ + memset(mem, 0, hugetlbsize); + + /* + * We need a total of two hugetlb pages to handle COW/unsharing + * properly, otherwise we might get zapped by a SIGBUS. + */ + dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); + if (dummy == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + goto munmap; + } + munmap(dummy, hugetlbsize); + + fn(mem, hugetlbsize); +munmap: + munmap(mem, hugetlbsize); +} + +struct test_case { + const char *desc; + test_fn fn; +}; + +/* + * Test cases that are specific to anonymous pages: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_test_cases[] = { + /* + * Basic COW tests for fork() without any GUP. If we miss to break COW, + * either the child can observe modifications by the parent or the + * other way around. 
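+	 * A leak shows up as child_memcmp_fn() observing the parent's 0xff
+	 * pattern instead of the original page content.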
+ */ + { + "Basic COW after fork()", + test_cow_in_parent, + }, + /* + * Basic test, but do an additional mprotect(PROT_READ)+ + * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. + */ + { + "Basic COW after fork() with mprotect() optimization", + test_cow_in_parent_mprotect, + }, + /* + * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If + * we miss to break COW, the child observes modifications by the parent. + * This is CVE-2020-29374 reported by Jann Horn. + */ + { + "vmsplice() + unmap in child", + test_vmsplice_in_child + }, + /* + * vmsplice() test, but do an additional mprotect(PROT_READ)+ + * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. + */ + { + "vmsplice() + unmap in child with mprotect() optimization", + test_vmsplice_in_child_mprotect + }, + /* + * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after + * fork(); modify in the child. If we miss to break COW, the parent + * observes modifications by the child. + */ + { + "vmsplice() before fork(), unmap in parent after fork()", + test_vmsplice_before_fork, + }, + /* + * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the + * child. If we miss to break COW, the parent observes modifications by + * the child. + */ + { + "vmsplice() + unmap in parent after fork()", + test_vmsplice_after_fork, + }, +#ifdef LOCAL_CONFIG_HAVE_LIBURING + /* + * Take a R/W longterm pin and then map the page R/O into the page + * table to trigger a write fault on next access. When modifying the + * page, the page content must be visible via the pin. + */ + { + "R/O-mapping a page registered as iouring fixed buffer", + test_iouring_ro, + }, + /* + * Take a R/W longterm pin and then fork() a child. When modifying the + * page, the page content must be visible via the pin. We expect the + * pinned page to not get shared with the child. + */ + { + "fork() with an iouring fixed buffer", + test_iouring_fork, + }, + +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + /* + * Take a R/O longterm pin on a R/O-mapped shared anonymous page. + * When modifying the page via the page table, the page content change + * must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped shared page", + test_ro_pin_on_shared, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O GUP-fast pin on R/O-mapped shared page", + test_ro_fast_pin_on_shared, + }, + /* + * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that + * was previously shared. When modifying the page via the page table, + * the page content change must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped previously-shared page", + test_ro_pin_on_ro_previously_shared, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O GUP-fast pin on R/O-mapped previously-shared page", + test_ro_fast_pin_on_ro_previously_shared, + }, + /* + * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page. + * When modifying the page via the page table, the page content change + * must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped exclusive page", + test_ro_pin_on_ro_exclusive, + }, + /* Same as above, but using GUP-fast. 
*/ + { + "R/O GUP-fast pin on R/O-mapped exclusive page", + test_ro_fast_pin_on_ro_exclusive, + }, +}; + +static void run_anon_test_case(struct test_case const *test_case) +{ + int i; + + run_with_base_page(test_case->fn, test_case->desc); + run_with_base_page_swap(test_case->fn, test_case->desc); + if (thpsize) { + run_with_thp(test_case->fn, test_case->desc); + run_with_thp_swap(test_case->fn, test_case->desc); + run_with_pte_mapped_thp(test_case->fn, test_case->desc); + run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc); + run_with_single_pte_of_thp(test_case->fn, test_case->desc); + run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc); + run_with_partial_mremap_thp(test_case->fn, test_case->desc); + run_with_partial_shared_thp(test_case->fn, test_case->desc); + } + for (i = 0; i < nr_hugetlbsizes; i++) + run_with_hugetlb(test_case->fn, test_case->desc, + hugetlbsizes[i]); +} + +static void run_anon_test_cases(void) +{ + int i; + + ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); + + for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++) + run_anon_test_case(&anon_test_cases[i]); +} + +static int tests_per_anon_test_case(void) +{ + int tests = 2 + nr_hugetlbsizes; + + if (thpsize) + tests += 8; + return tests; +} + +enum anon_thp_collapse_test { + ANON_THP_COLLAPSE_UNSHARED, + ANON_THP_COLLAPSE_FULLY_SHARED, + ANON_THP_COLLAPSE_LOWER_SHARED, + ANON_THP_COLLAPSE_UPPER_SHARED, +}; + +static void do_test_anon_thp_collapse(char *mem, size_t size, + enum anon_thp_collapse_test test) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + /* + * Trigger PTE-mapping the THP by temporarily mapping a single subpage + * R/O, such that we can try collapsing it later. + */ + ret = mprotect(mem + pagesize, pagesize, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + /* Collapse before actually COW-sharing the page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* COW-share the full PTE-mapped THP. */ + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* Don't COW-share the upper part of the THP. */ + ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + /* Don't COW-share the lower part of the THP. 
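+		 * (With MADV_DONTFORK on the lower half, only the upper half
+		 * gets COW-shared with the child.)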
*/ + ret = madvise(mem, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + case ANON_THP_COLLAPSE_FULLY_SHARED: + exit(child_memcmp_fn(mem, size, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + exit(child_memcmp_fn(mem + size / 2, size / 2, + &comm_pipes)); + break; + default: + assert(false); + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* + * Revert MADV_DONTFORK such that we merge the VMAs and are + * able to actually collapse. + */ + ret = madvise(mem, size, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + /* FALLTHROUGH */ + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* Collapse before anyone modified the COW-shared page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Modify the page. */ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + ksft_test_result(!ret, "No leak from parent into child\n"); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_anon_thp_collapse_unshared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); +} + +static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); +} + +static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); +} + +static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); +} + +/* + * Test cases that are specific to anonymous THP: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_thp_test_cases[] = { + /* + * Basic COW test for fork() without any GUP when collapsing a THP + * before fork(). + * + * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place + * collapse") might easily get COW handling wrong when not collapsing + * exclusivity information properly. + */ + { + "Basic COW after fork() when collapsing before fork()", + test_anon_thp_collapse_unshared, + }, + /* Basic COW test, but collapse after COW-sharing a full THP. */ + { + "Basic COW after fork() when collapsing after fork() (fully shared)", + test_anon_thp_collapse_fully_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the lower half of a + * THP. + */ + { + "Basic COW after fork() when collapsing after fork() (lower shared)", + test_anon_thp_collapse_lower_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the upper half of a + * THP. 
+	 */
+	{
+		"Basic COW after fork() when collapsing after fork() (upper shared)",
+		test_anon_thp_collapse_upper_shared,
+	},
+};
+
+static void run_anon_thp_test_cases(void)
+{
+	int i;
+
+	if (!thpsize)
+		return;
+
+	ksft_print_msg("[INFO] Anonymous THP tests\n");
+
+	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
+		struct test_case const *test_case = &anon_thp_test_cases[i];
+
+		ksft_print_msg("[RUN] %s\n", test_case->desc);
+		do_run_with_thp(test_case->fn, THP_RUN_PMD);
+	}
+}
+
+static int tests_per_anon_thp_test_case(void)
+{
+	return thpsize ? 1 : 0;
+}
+
+typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
+
+static void test_cow(char *mem, const char *smem, size_t size)
+{
+	char *old = malloc(size);
+
+	/* Backup the original content. */
+	memcpy(old, smem, size);
+
+	/* Modify the page. */
+	memset(mem, 0xff, size);
+
+	/* See if we still read the old values via the other mapping. */
+	ksft_test_result(!memcmp(smem, old, size),
+			 "Other mapping not modified\n");
+	free(old);
+}
+
+static void test_ro_pin(char *mem, const char *smem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
+}
+
+static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
+}
+
+static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, tmp;
+
+	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
+
+	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		return;
+	}
+
+	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (smem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Read from the page to populate the shared zeropage. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, pagesize);
+munmap:
+	munmap(mem, pagesize);
+	if (smem != MAP_FAILED)
+		munmap(smem, pagesize);
+}
+
+static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
+	size_t mmap_size;
+	int ret;
+
+	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
+
+	if (!has_huge_zeropage) {
+		ksft_test_result_skip("Huge zeropage not enabled\n");
+		return;
+	}
+
+	/* For alignment purposes, we need twice the thp size. */
+	mmap_size = 2 * thpsize;
+	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (mmap_mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		return;
+	}
+	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
+			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (mmap_smem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* We need a THP-aligned memory area. */
+	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+	smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));
+
+	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
+	ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
+	if (ret) {
+		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
+		goto munmap;
+	}
+
+	/*
+	 * Read from the memory to populate the huge shared zeropage. Read from
+	 * the first sub-page and test if we get another sub-page populated
+	 * automatically.
+	 */
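+	/* The asm below forces the reads so the compiler cannot elide them. */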
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
+	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
+		ksft_test_result_skip("Did not get THPs populated\n");
+		goto munmap;
+	}
+
+	fn(mem, smem, thpsize);
+munmap:
+	munmap(mmap_mem, mmap_size);
+	if (mmap_smem != MAP_FAILED)
+		munmap(mmap_smem, mmap_size);
+}
+
+static void run_with_memfd(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, tmp;
+	int fd;
+
+	ksft_print_msg("[RUN] %s ... with memfd\n", desc);
+
+	fd = memfd_create("test", 0);
+	if (fd < 0) {
+		ksft_test_result_fail("memfd_create() failed\n");
+		return;
+	}
+
+	/* File consists of a single page filled with zeroes. */
+	if (fallocate(fd, 0, 0, pagesize)) {
+		ksft_test_result_fail("fallocate() failed\n");
+		goto close;
+	}
+
+	/* Create a private mapping of the memfd. */
+	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto close;
+	}
+	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
+	if (smem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Fault the page in. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, pagesize);
+munmap:
+	munmap(mem, pagesize);
+	if (smem != MAP_FAILED)
+		munmap(smem, pagesize);
+close:
+	close(fd);
+}
+
+static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, tmp;
+	FILE *file;
+	int fd;
+
+	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
+
+	file = tmpfile();
+	if (!file) {
+		ksft_test_result_fail("tmpfile() failed\n");
+		return;
+	}
+
+	fd = fileno(file);
+	if (fd < 0) {
+		ksft_test_result_skip("fileno() failed\n");
+		return;
+	}
+
+	/* File consists of a single page filled with zeroes. */
+	if (fallocate(fd, 0, 0, pagesize)) {
+		ksft_test_result_fail("fallocate() failed\n");
+		goto close;
+	}
+
+	/* Create a private mapping of the tmpfile. */
+	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto close;
+	}
+	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
+	if (smem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Fault the page in. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, pagesize);
+munmap:
+	munmap(mem, pagesize);
+	if (smem != MAP_FAILED)
+		munmap(smem, pagesize);
+close:
+	fclose(file);
+}
+
+static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
+				   size_t hugetlbsize)
+{
+	int flags = MFD_HUGETLB;
+	char *mem, *smem, tmp;
+	int fd;
+
+	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
+		       hugetlbsize / 1024);
+
+	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
+
+	fd = memfd_create("test", flags);
+	if (fd < 0) {
+		ksft_test_result_skip("memfd_create() failed\n");
+		return;
+	}
+
+	/* File consists of a single page filled with zeroes. */
+	if (fallocate(fd, 0, 0, hugetlbsize)) {
+		ksft_test_result_skip("need more free huge pages\n");
+		goto close;
+	}
+
+	/* Create a private mapping of the memfd. */
+	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
+		   0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_skip("need more free huge pages\n");
+		goto close;
+	}
+	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
+	if (smem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Fault the page in. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, hugetlbsize);
+munmap:
+	munmap(mem, hugetlbsize);
+	if (smem != MAP_FAILED)
+		munmap(smem, hugetlbsize);
+close:
+	close(fd);
+}
+
+struct non_anon_test_case {
+	const char *desc;
+	non_anon_test_fn fn;
+};
+
+/*
+ * Test cases that target any pages in private mappings that are not anonymous:
+ * pages that may get shared via COW independent of fork(). This includes
+ * the shared zeropage(s), pagecache pages, ...
+ */
+static const struct non_anon_test_case non_anon_test_cases[] = {
+	/*
+	 * Basic COW test without any GUP. If we miss to break COW, changes are
+	 * visible via other private/shared mappings.
+	 */
+	{
+		"Basic COW",
+		test_cow,
+	},
+	/*
+	 * Take a R/O longterm pin. When modifying the page via the page table,
+	 * the page content change must be visible via the pin.
+	 */
+	{
+		"R/O longterm GUP pin",
+		test_ro_pin,
+	},
+	/* Same as above, but using GUP-fast. */
+	{
+		"R/O longterm GUP-fast pin",
+		test_ro_fast_pin,
+	},
+};
+
+static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
+{
+	int i;
+
+	run_with_zeropage(test_case->fn, test_case->desc);
+	run_with_memfd(test_case->fn, test_case->desc);
+	run_with_tmpfile(test_case->fn, test_case->desc);
+	if (thpsize)
+		run_with_huge_zeropage(test_case->fn, test_case->desc);
+	for (i = 0; i < nr_hugetlbsizes; i++)
+		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
+				       hugetlbsizes[i]);
+}
+
+static void run_non_anon_test_cases(void)
+{
+	int i;
+
+	ksft_print_msg("[INFO] Non-anonymous memory tests in private mappings\n");
+
+	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
+		run_non_anon_test_case(&non_anon_test_cases[i]);
+}
+
+static int tests_per_non_anon_test_case(void)
+{
+	int tests = 3 + nr_hugetlbsizes;
+
+	if (thpsize)
+		tests += 1;
+	return tests;
+}
+
+int main(int argc, char **argv)
+{
+	int err;
+
+	pagesize = getpagesize();
+	detect_thpsize();
+	detect_hugetlbsizes();
+	detect_huge_zeropage();
+
+	ksft_print_header();
+	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
+		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
+		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
+
+	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+	if (pagemap_fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+
+	run_anon_test_cases();
+	run_anon_thp_test_cases();
+	run_non_anon_test_cases();
+
+	err = ksft_get_fail_cnt();
+	if (err)
+		ksft_exit_fail_msg("%d out of %d tests failed\n",
+				   err, ksft_test_num());
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c
new file mode 100644
index 000000000000..e43879291dac
--- /dev/null
+++ b/tools/testing/selftests/mm/gup_test.c
@@ -0,0 +1,271 @@
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <assert.h>
+#include <mm/gup_test.h>
+#include "../kselftest.h"
+
+#include "util.h"
+
+#define MB (1UL << 20)
+
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE	0x01	/* check pte is writable */
+#define FOLL_TOUCH 0x02 /* mark page accessed */ + +#define GUP_TEST_FILE "/sys/kernel/debug/gup_test" + +static unsigned long cmd = GUP_FAST_BENCHMARK; +static int gup_fd, repeats = 1; +static unsigned long size = 128 * MB; +/* Serialize prints */ +static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER; + +static char *cmd_to_str(unsigned long cmd) +{ + switch (cmd) { + case GUP_FAST_BENCHMARK: + return "GUP_FAST_BENCHMARK"; + case PIN_FAST_BENCHMARK: + return "PIN_FAST_BENCHMARK"; + case PIN_LONGTERM_BENCHMARK: + return "PIN_LONGTERM_BENCHMARK"; + case GUP_BASIC_TEST: + return "GUP_BASIC_TEST"; + case PIN_BASIC_TEST: + return "PIN_BASIC_TEST"; + case DUMP_USER_PAGES_TEST: + return "DUMP_USER_PAGES_TEST"; + } + return "Unknown command"; +} + +void *gup_thread(void *data) +{ + struct gup_test gup = *(struct gup_test *)data; + int i; + + /* Only report timing information on the *_BENCHMARK commands: */ + if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || + (cmd == PIN_LONGTERM_BENCHMARK)) { + for (i = 0; i < repeats; i++) { + gup.size = size; + if (ioctl(gup_fd, cmd, &gup)) + perror("ioctl"), exit(1); + + pthread_mutex_lock(&print_mutex); + printf("%s: Time: get:%lld put:%lld us", + cmd_to_str(cmd), gup.get_delta_usec, + gup.put_delta_usec); + if (gup.size != size) + printf(", truncated (size: %lld)", gup.size); + printf("\n"); + pthread_mutex_unlock(&print_mutex); + } + } else { + gup.size = size; + if (ioctl(gup_fd, cmd, &gup)) { + perror("ioctl"); + exit(1); + } + + pthread_mutex_lock(&print_mutex); + printf("%s: done\n", cmd_to_str(cmd)); + if (gup.size != size) + printf("Truncated (size: %lld)\n", gup.size); + pthread_mutex_unlock(&print_mutex); + } + + return NULL; +} + +int main(int argc, char **argv) +{ + struct gup_test gup = { 0 }; + int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; + int flags = MAP_PRIVATE, touch = 0; + char *file = "/dev/zero"; + pthread_t *tid; + char *p; + + while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) { + switch (opt) { + case 'a': + cmd = PIN_FAST_BENCHMARK; + break; + case 'b': + cmd = PIN_BASIC_TEST; + break; + case 'L': + cmd = PIN_LONGTERM_BENCHMARK; + break; + case 'c': + cmd = DUMP_USER_PAGES_TEST; + /* + * Dump page 0 (index 1). May be overridden later, by + * user's non-option arguments. + * + * .which_pages is zero-based, so that zero can mean "do + * nothing". 
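+			 * Extra page indices given on the command line are
+			 * shifted by one for the same reason (see the optind
+			 * loop below).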
+ */ + gup.which_pages[0] = 1; + break; + case 'p': + /* works only with DUMP_USER_PAGES_TEST */ + gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN; + break; + case 'F': + /* strtol, so you can pass flags in hex form */ + gup.gup_flags = strtol(optarg, 0, 0); + break; + case 'j': + nthreads = atoi(optarg); + break; + case 'm': + size = atoi(optarg) * MB; + break; + case 'r': + repeats = atoi(optarg); + break; + case 'n': + nr_pages = atoi(optarg); + break; + case 't': + thp = 1; + break; + case 'T': + thp = 0; + break; + case 'U': + cmd = GUP_BASIC_TEST; + break; + case 'u': + cmd = GUP_FAST_BENCHMARK; + break; + case 'w': + write = 1; + break; + case 'W': + write = 0; + break; + case 'f': + file = optarg; + break; + case 'S': + flags &= ~MAP_PRIVATE; + flags |= MAP_SHARED; + break; + case 'H': + flags |= (MAP_HUGETLB | MAP_ANONYMOUS); + break; + case 'z': + /* fault pages in gup, do not fault in userland */ + touch = 1; + break; + default: + return -1; + } + } + + if (optind < argc) { + int extra_arg_count = 0; + /* + * For example: + * + * ./gup_test -c 0 1 0x1001 + * + * ...to dump pages 0, 1, and 4097 + */ + + while ((optind < argc) && + (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) { + /* + * Do the 1-based indexing here, so that the user can + * use normal 0-based indexing on the command line. + */ + long page_index = strtol(argv[optind], 0, 0) + 1; + + gup.which_pages[extra_arg_count] = page_index; + extra_arg_count++; + optind++; + } + } + + filed = open(file, O_RDWR|O_CREAT); + if (filed < 0) { + perror("open"); + exit(filed); + } + + gup.nr_pages_per_call = nr_pages; + if (write) + gup.gup_flags |= FOLL_WRITE; + + gup_fd = open(GUP_TEST_FILE, O_RDWR); + if (gup_fd == -1) { + switch (errno) { + case EACCES: + if (getuid()) + printf("Please run this test as root\n"); + break; + case ENOENT: + if (opendir("/sys/kernel/debug") == NULL) { + printf("mount debugfs at /sys/kernel/debug\n"); + break; + } + printf("check if CONFIG_GUP_TEST is enabled in kernel config\n"); + break; + default: + perror("failed to open " GUP_TEST_FILE); + break; + } + exit(KSFT_SKIP); + } + + p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); + if (p == MAP_FAILED) { + perror("mmap"); + exit(1); + } + gup.addr = (unsigned long)p; + + if (thp == 1) + madvise(p, size, MADV_HUGEPAGE); + else if (thp == 0) + madvise(p, size, MADV_NOHUGEPAGE); + + /* + * FOLL_TOUCH, in gup_test, is used as an either/or case: either + * fault pages in from the kernel via FOLL_TOUCH, or fault them + * in here, from user space. This allows comparison of performance + * between those two cases. + */ + if (touch) { + gup.gup_flags |= FOLL_TOUCH; + } else { + for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) + p[0] = 0; + } + + tid = malloc(sizeof(pthread_t) * nthreads); + assert(tid); + for (i = 0; i < nthreads; i++) { + ret = pthread_create(&tid[i], NULL, gup_thread, &gup); + assert(ret == 0); + } + for (i = 0; i < nthreads; i++) { + ret = pthread_join(tid[i], NULL); + assert(ret == 0); + } + free(tid); + + return 0; +} diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c new file mode 100644 index 000000000000..4adaad1b822f --- /dev/null +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -0,0 +1,2054 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HMM stands for Heterogeneous Memory Management, it is a helper layer inside + * the linux kernel to help device drivers mirror a process address space in + * the device. 
This allows the device to use the same address space, which
+ * makes communication and data exchange a lot easier.
+ *
+ * This framework's sole purpose is to exercise various code paths inside
+ * the kernel to make sure that HMM performs as expected and to flush out any
+ * bugs.
+ */
+
+#include "../kselftest_harness.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <strings.h>
+#include <time.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+
+/*
+ * This is a private UAPI to the kernel test module so it isn't exported
+ * in the usual include/uapi/... directory.
+ */
+#include <lib/test_hmm_uapi.h>
+#include <mm/gup_test.h>
+
+struct hmm_buffer {
+	void		*ptr;
+	void		*mirror;
+	unsigned long	size;
+	int		fd;
+	uint64_t	cpages;
+	uint64_t	faults;
+};
+
+enum {
+	HMM_PRIVATE_DEVICE_ONE,
+	HMM_PRIVATE_DEVICE_TWO,
+	HMM_COHERENCE_DEVICE_ONE,
+	HMM_COHERENCE_DEVICE_TWO,
+};
+
+#define TWOMEG		(1 << 21)
+#define HMM_BUFFER_SIZE (1024 << 12)
+#define HMM_PATH_MAX    64
+#define NTIMES		10
+
+#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE	0x01	/* check pte is writable */
+#define FOLL_LONGTERM	0x10000 /* mapping lifetime is indefinite */
+
+FIXTURE(hmm)
+{
+	int		fd;
+	unsigned int	page_size;
+	unsigned int	page_shift;
+};
+
+FIXTURE_VARIANT(hmm)
+{
+	int	device_number;
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
+{
+	.device_number = HMM_PRIVATE_DEVICE_ONE,
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
+{
+	.device_number = HMM_COHERENCE_DEVICE_ONE,
+};
+
+FIXTURE(hmm2)
+{
+	int		fd0;
+	int		fd1;
+	unsigned int	page_size;
+	unsigned int	page_shift;
+};
+
+FIXTURE_VARIANT(hmm2)
+{
+	int	device_number0;
+	int	device_number1;
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
+{
+	.device_number0 = HMM_PRIVATE_DEVICE_ONE,
+	.device_number1 = HMM_PRIVATE_DEVICE_TWO,
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
+{
+	.device_number0 = HMM_COHERENCE_DEVICE_ONE,
+	.device_number1 = HMM_COHERENCE_DEVICE_TWO,
+};
+
+static int hmm_open(int unit)
+{
+	char pathname[HMM_PATH_MAX];
+	int fd;
+
+	snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit);
+	fd = open(pathname, O_RDWR, 0);
+	if (fd < 0)
+		fprintf(stderr, "could not open hmm dmirror driver (%s)\n",
+			pathname);
+	return fd;
+}
+
+static bool hmm_is_coherent_type(int dev_num)
+{
+	return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
+}
+
+FIXTURE_SETUP(hmm)
+{
+	self->page_size = sysconf(_SC_PAGE_SIZE);
+	self->page_shift = ffs(self->page_size) - 1;
+
+	self->fd = hmm_open(variant->device_number);
+	if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
+		SKIP(exit(0), "DEVICE_COHERENT not available");
+	ASSERT_GE(self->fd, 0);
+}
+
+FIXTURE_SETUP(hmm2)
+{
+	self->page_size = sysconf(_SC_PAGE_SIZE);
+	self->page_shift = ffs(self->page_size) - 1;
+
+	self->fd0 = hmm_open(variant->device_number0);
+	if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
+		SKIP(exit(0), "DEVICE_COHERENT not available");
+	ASSERT_GE(self->fd0, 0);
+	self->fd1 = hmm_open(variant->device_number1);
+	ASSERT_GE(self->fd1, 0);
+}
+
+FIXTURE_TEARDOWN(hmm)
+{
+	int ret = close(self->fd);
+
+	ASSERT_EQ(ret, 0);
+	self->fd = -1;
+}
+
+FIXTURE_TEARDOWN(hmm2)
+{
+	int ret = close(self->fd0);
+
+	ASSERT_EQ(ret, 0);
+	self->fd0 = -1;
+
+	ret = close(self->fd1);
+	ASSERT_EQ(ret, 0);
+	self->fd1 = -1;
+}
+
+static int hmm_dmirror_cmd(int fd,
+			   unsigned long request,
+			   struct hmm_buffer *buffer,
+			   unsigned long npages)
+{
+	struct hmm_dmirror_cmd cmd;
+	int ret;
+
+	/* Simulate a device
reading system memory. */ + cmd.addr = (__u64)buffer->ptr; + cmd.ptr = (__u64)buffer->mirror; + cmd.npages = npages; + + for (;;) { + ret = ioctl(fd, request, &cmd); + if (ret == 0) + break; + if (errno == EINTR) + continue; + return -errno; + } + buffer->cpages = cmd.cpages; + buffer->faults = cmd.faults; + + return 0; +} + +static void hmm_buffer_free(struct hmm_buffer *buffer) +{ + if (buffer == NULL) + return; + + if (buffer->ptr) + munmap(buffer->ptr, buffer->size); + free(buffer->mirror); + free(buffer); +} + +/* + * Create a temporary file that will be deleted on close. + */ +static int hmm_create_file(unsigned long size) +{ + char path[HMM_PATH_MAX]; + int fd; + + strcpy(path, "/tmp"); + fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600); + if (fd >= 0) { + int r; + + do { + r = ftruncate(fd, size); + } while (r == -1 && errno == EINTR); + if (!r) + return fd; + close(fd); + } + return -1; +} + +/* + * Return a random unsigned number. + */ +static unsigned int hmm_random(void) +{ + static int fd = -1; + unsigned int r; + + if (fd < 0) { + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + fprintf(stderr, "%s:%d failed to open /dev/urandom\n", + __FILE__, __LINE__); + return ~0U; + } + } + read(fd, &r, sizeof(r)); + return r; +} + +static void hmm_nanosleep(unsigned int n) +{ + struct timespec t; + + t.tv_sec = 0; + t.tv_nsec = n; + nanosleep(&t, NULL); +} + +static int hmm_migrate_sys_to_dev(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages); +} + +static int hmm_migrate_dev_to_sys(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages); +} + +/* + * Simple NULL test of device open/close. + */ +TEST_F(hmm, open_close) +{ +} + +/* + * Read private anonymous memory. + */ +TEST_F(hmm, anon_read) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int val; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* + * Initialize buffer in system memory but leave the first two pages + * zero (pte_none and pfn_zero). + */ + i = 2 * self->page_size / sizeof(*ptr); + for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Set buffer permission to read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Populate the CPU page table with a special zero page. */ + val = *(int *)(buffer->ptr + self->page_size); + ASSERT_EQ(val, 0); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + ptr = buffer->mirror; + for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + for (; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Read private anonymous memory which has been protected with + * mprotect() PROT_NONE. 
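+ *
+ * The device read must fail with -EFAULT and must leave the mirror
+ * buffer untouched.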
+ */ +TEST_F(hmm, anon_read_prot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize mirror buffer so we can verify it isn't written. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + /* Protect buffer from reading. */ + ret = mprotect(buffer->ptr, size, PROT_NONE); + ASSERT_EQ(ret, 0); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, -EFAULT); + + /* Allow CPU to read the buffer so we can check it. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + hmm_buffer_free(buffer); +} + +/* + * Write private anonymous memory. + */ +TEST_F(hmm, anon_write) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Write private anonymous memory which has been protected with + * mprotect() PROT_READ. + */ +TEST_F(hmm, anon_write_prot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device reading a zero page of memory. 
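+	 * (The read faults in the shared zeropage, so the device write below
+	 * initially hits a R/O mapping and must fail with -EPERM.)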
*/ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + ASSERT_EQ(buffer->faults, 1); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, -EPERM); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Now allow writing and see that the zero page is replaced. */ + ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Check that a device writing an anonymous private mapping + * will copy-on-write if a child process inherits the mapping. + */ +TEST_F(hmm, anon_write_child) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + pid_t pid; + int child_fd; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did not change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + return; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + close(child_fd); + exit(0); +} + +/* + * Check that a device writing an anonymous shared mapping + * will not copy-on-write if a child process inherits the mapping. 
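
anon_write_child above relies on ordinary fork() copy-on-write semantics for MAP_PRIVATE memory: once either side writes, the two processes diverge. Stripped of the device mirroring, the behaviour under test is just this:

    #include <assert.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            int *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            pid_t pid;

            assert(p != MAP_FAILED);
            *p = 1;
            pid = fork();
            assert(pid >= 0);
            if (pid == 0) {
                    *p = 2;         /* COW: the child gets a private copy */
                    _exit(0);
            }
            waitpid(pid, NULL, 0);
            assert(*p == 1);        /* the parent's page is unchanged */
            return 0;
    }
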
+ */ +TEST_F(hmm, anon_write_child_shared) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + pid_t pid; + int child_fd; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + return; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + close(child_fd); + exit(0); +} + +/* + * Write private anonymous huge page. + */ +TEST_F(hmm, anon_write_huge) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = 2 * TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + size = TWOMEG; + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Read numeric data from raw and tagged kernel status files. Used to read + * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). 
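
anon_write_huge above obtains a 2MB-aligned region without MAP_HUGETLB by over-mapping and rounding the start address up, which is what its ALIGN() call does. A sketch of that trick in isolation (TWOMEG_SZ stands in for the test's TWOMEG constant; madvise() may fail with EINVAL on kernels without THP):

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/mman.h>

    #define TWOMEG_SZ       (2UL << 20)
    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
            /* Map twice the huge page size so one aligned 2MB chunk must fit. */
            void *raw = mmap(NULL, 2 * TWOMEG_SZ, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            void *huge;

            if (raw == MAP_FAILED)
                    return 1;
            huge = (void *)ALIGN_UP((uintptr_t)raw, TWOMEG_SZ);
            if (madvise(huge, TWOMEG_SZ, MADV_HUGEPAGE))
                    perror("madvise");      /* e.g. THP not configured */
            return 0;
    }
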
+ */ +static long file_read_ulong(char *file, const char *tag) +{ + int fd; + char buf[2048]; + int len; + char *p, *q; + long val; + + fd = open(file, O_RDONLY); + if (fd < 0) { + /* Error opening the file */ + return -1; + } + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) { + /* Error in reading the file */ + return -1; + } + if (len == sizeof(buf)) { + /* Error file is too large */ + return -1; + } + buf[len] = '\0'; + + /* Search for a tag if provided */ + if (tag) { + p = strstr(buf, tag); + if (!p) + return -1; /* looks like the line we want isn't there */ + p += strlen(tag); + } else + p = buf; + + val = strtol(p, &q, 0); + if (*q != ' ') { + /* Error parsing the file */ + return -1; + } + + return val; +} + +/* + * Write huge TLBFS page. + */ +TEST_F(hmm, anon_write_hugetlbfs) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long default_hsize; + unsigned long i; + int *ptr; + int ret; + + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) + SKIP(return, "Huge page size could not be determined"); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); + npages = size >> self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { + free(buffer); + SKIP(return, "Huge page could not be allocated"); + } + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + hmm_buffer_free(buffer); +} + +/* + * Read mmap'ed file memory. + */ +TEST_F(hmm, file_read) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + ssize_t len; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Write initial contents of the file. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + len = pwrite(fd, buffer->mirror, size, 0); + ASSERT_EQ(len, size); + memset(buffer->mirror, 0, size); + + buffer->ptr = mmap(NULL, size, + PROT_READ, + MAP_SHARED, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. 
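
file_read_ulong() above buffers the whole file and parses with strtol() so it can serve both bare /proc/sys-style files and tagged /proc/meminfo lines. For the tagged case alone, a getline()/sscanf() loop is an equally workable sketch, and is essentially what hugetlb-madvise.c's default_huge_page_size() does later in this series:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>

    /* Return the value of one "Tag: <n> ..." meminfo line, or 0 if absent. */
    static unsigned long meminfo_ulong(const char *fmt)
    {
            unsigned long val = 0;
            char *line = NULL;
            size_t len = 0;
            FILE *f = fopen("/proc/meminfo", "r");

            if (!f)
                    return 0;
            while (getline(&line, &len, f) > 0)
                    if (sscanf(line, fmt, &val) == 1)
                            break;
            free(line);
            fclose(f);
            return val;
    }

    /* e.g. meminfo_ulong("Hugepagesize: %lu kB") -> default huge page size in KiB */
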
*/ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Write mmap'ed file memory. + */ +TEST_F(hmm, file_write) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + ssize_t len; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Check that the device also wrote the file. */ + len = pread(fd, buffer->mirror, size, 0); + ASSERT_EQ(len, size); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device private memory. + */ +TEST_F(hmm, migrate) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device private memory and fault some of it back + * to system memory, then try migrating the resulting mix of system and device + * private memory to the device. + */ +TEST_F(hmm, migrate_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. 
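
file_write above checks that a device write through the mirror lands in the page cache and is visible via pread(2). The unified page-cache property it relies on can be sketched with no device at all; memfd_create() stands in here for the test's hmm_create_file() tmpfile:

    #define _GNU_SOURCE
    #include <assert.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = memfd_create("coherence", 0);
            char *p, back = 0;

            assert(fd >= 0 && ftruncate(fd, 4096) == 0);
            p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            assert(p != MAP_FAILED);

            p[0] = 'x';                             /* store via the mapping */
            assert(pread(fd, &back, 1, 0) == 1);    /* read via the fd */
            assert(back == 'x');
            return 0;
    }
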
*/ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault half the pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate memory to the device again. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +TEST_F(hmm, migrate_release) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Release device memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages); + ASSERT_EQ(ret, 0); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous shared memory to device private memory. + */ +TEST_F(hmm, migrate_shared) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, -ENOENT); + + hmm_buffer_free(buffer); +} + +/* + * Try to migrate various memory types to device private memory. + */ +TEST_F(hmm2, migrate_mixed) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int *ptr; + unsigned char *p; + int ret; + int val; + + npages = 6; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. 
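
migrate_mixed, whose body follows, builds its layout by reserving one PROT_NONE range and then selectively re-enabling pages with mprotect(), so all six pages share a contiguous address range but carry different protections. The reserve-then-enable pattern in isolation:

    #include <assert.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long psz = sysconf(_SC_PAGE_SIZE);
            char *base = mmap(NULL, 6 * psz, PROT_NONE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            assert(base != MAP_FAILED);
            /* Enable page 2 read-only and pages 4-5 read-write. */
            assert(mprotect(base + 2 * psz, psz, PROT_READ) == 0);
            assert(mprotect(base + 4 * psz, 2 * psz,
                            PROT_READ | PROT_WRITE) == 0);
            base[4 * psz] = 1;      /* now legal */
            return 0;
    }
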
*/ + buffer->ptr = mmap(NULL, size, + PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + p = buffer->ptr; + + /* Migrating a protected area should be an error. */ + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); + ASSERT_EQ(ret, -EINVAL); + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + /* We expect an error if the vma doesn't cover the range. */ + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3); + ASSERT_EQ(ret, -EINVAL); + + /* Page 2 will be a read-only zero page. */ + ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 2 * self->page_size); + val = *ptr + 3; + ASSERT_EQ(val, 3); + + /* Page 3 will be read-only. */ + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 3 * self->page_size); + *ptr = val; + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + + /* Pages 4-5 will be read-write. */ + ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 4 * self->page_size); + *ptr = val; + ptr = (int *)(buffer->ptr + 5 * self->page_size); + *ptr = val; + + /* Now try to migrate pages 2-5 to device 1. */ + buffer->ptr = p + 2 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 4); + + /* Page 5 won't be migrated to device 0 because it's on device 1. */ + buffer->ptr = p + 5 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); + ASSERT_EQ(ret, -ENOENT); + + buffer->ptr = p; + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device memory and back to system memory + * multiple times. In case of a private zone configuration, this is done + * through CPU page faults on the accessed pages. In case of a coherent zone + * configuration, the pages must be explicitly migrated back to system memory. + * This is because a coherent device zone is directly accessible by the CPU + * and therefore does not generate any page fault. + */ +TEST_F(hmm, migrate_multiple) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + unsigned long c; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; c++) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate back to system memory and check them.
*/ + if (hmm_is_coherent_type(variant->device_number)) { + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + } + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); + } +} + +/* + * Read anonymous memory multiple times. + */ +TEST_F(hmm, anon_read_multiple) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + unsigned long c; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; c++) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i + c); + + hmm_buffer_free(buffer); + } +} + +void *unmap_buffer(void *p) +{ + struct hmm_buffer *buffer = p; + + /* Delay for a bit and then unmap buffer while it is being read. */ + hmm_nanosleep(hmm_random() % 32000); + munmap(buffer->ptr + buffer->size / 2, buffer->size / 2); + buffer->ptr = NULL; + + return NULL; +} + +/* + * Try reading anonymous memory while it is being unmapped. + */ +TEST_F(hmm, anon_teardown) +{ + unsigned long npages; + unsigned long size; + unsigned long c; + void *ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; ++c) { + pthread_t thread; + struct hmm_buffer *buffer; + unsigned long i; + int *ptr; + int rc; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + rc = pthread_create(&thread, NULL, unmap_buffer, buffer); + ASSERT_EQ(rc, 0); + + /* Simulate a device reading system memory. */ + rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + if (rc == 0) { + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; + i < size / sizeof(*ptr); + ++i) + ASSERT_EQ(ptr[i], i + c); + } + + pthread_join(thread, &ret); + hmm_buffer_free(buffer); + } +} + +/* + * Test memory snapshot without faulting in pages accessed by the device. 
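
anon_teardown above is a stress test: unmap_buffer() pulls half of the mapping out from under a concurrent device read, and any return code is acceptable provided nothing crashes or corrupts memory. The thread-versus-munmap skeleton, minus the device (build with -pthread):

    #include <assert.h>
    #include <pthread.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static long psz;

    static void *unmap_half(void *arg)
    {
            char *region = arg;

            usleep(1000);                   /* stand-in for hmm_nanosleep() */
            munmap(region + psz, psz);      /* drop the second page */
            return NULL;
    }

    int main(void)
    {
            pthread_t t;
            char *p;

            psz = sysconf(_SC_PAGE_SIZE);
            p = mmap(NULL, 2 * psz, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            assert(p != MAP_FAILED);
            assert(pthread_create(&t, NULL, unmap_half, p) == 0);

            memset(p, 0, psz);      /* races with the unmap of the other page */
            pthread_join(t, NULL);
            return 0;
    }
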
+ */ +TEST_F(hmm, mixedmap) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned char *m; + int ret; + + npages = 1; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + self->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ); + + hmm_buffer_free(buffer); +} + +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm2, snapshot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int *ptr; + unsigned char *p; + unsigned char *m; + int ret; + int val; + + npages = 7; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + p = buffer->ptr; + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + /* Page 2 will be read-only zero page. */ + ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 2 * self->page_size); + val = *ptr + 3; + ASSERT_EQ(val, 3); + + /* Page 3 will be read-only. */ + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 3 * self->page_size); + *ptr = val; + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + + /* Page 4-6 will be read-write. */ + ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 4 * self->page_size); + *ptr = val; + + /* Page 5 will be migrated to device 0. */ + buffer->ptr = p + 5 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Page 6 will be migrated to device 1. */ + buffer->ptr = p + 6 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Simulate a device snapshotting CPU pagetables. */ + buffer->ptr = p; + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. 
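
The assertions that follow compare raw snapshot bytes against OR-ed flag values. When debugging a failure it can help to pretty-print a byte; this sketch assumes the HMM_DMIRROR_PROT_* definitions from lib/test_hmm_uapi.h, the same header hmm-tests.c includes:

    #include <stdio.h>
    #include "../../../../lib/test_hmm_uapi.h"  /* same include as hmm-tests.c */

    static void dump_snapshot_byte(unsigned char m)
    {
            if (m == HMM_DMIRROR_PROT_ERROR) {
                    printf("error\n");
                    return;
            }
            printf("%s%s%s\n",
                   (m & HMM_DMIRROR_PROT_WRITE) ? "writable" :
                   (m & HMM_DMIRROR_PROT_READ)  ? "read-only" : "not mapped",
                   (m & HMM_DMIRROR_PROT_ZERO)  ? ", zero page" : "",
                   (m & HMM_DMIRROR_PROT_PMD)   ? ", PMD-mapped" : "");
    }
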
*/ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); + if (!hmm_is_coherent_type(variant->device_number0)) { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + } else { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE | + HMM_DMIRROR_PROT_WRITE); + } + + hmm_buffer_free(buffer); +} + +/* + * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that + * should be mapped by a large page table entry. + */ +TEST_F(hmm, compound) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long default_hsize; + int *ptr; + unsigned char *m; + int ret; + unsigned long i; + + /* Skip test if we can't allocate a hugetlbfs page. */ + + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) + SKIP(return, "Huge page size could not be determined"); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); + npages = size >> self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { + free(buffer); + return; + } + + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Initialize the pages the device will snapshot in buffer->ptr. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + for (i = 0; i < npages; ++i) + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE | + HMM_DMIRROR_PROT_PMD); + + /* Make the region read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + for (i = 0; i < npages; ++i) + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ | + HMM_DMIRROR_PROT_PMD); + + munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + hmm_buffer_free(buffer); +} + +/* + * Test two devices reading the same memory (double mapped). + */ +TEST_F(hmm2, double_map) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = 6; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Make region read-only. 
*/ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate device 0 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Simulate device 1 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate pages to device 1 and try to read from device 0. */ + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what device 0 read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Basic check of exclusive faulting. + */ +TEST_F(hmm, exclusive) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i]++, i); + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i+1); + + /* Check atomic access revoked */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + + hmm_buffer_free(buffer); +} + +TEST_F(hmm, exclusive_mprotect) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. 
*/ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, -EPERM); + + hmm_buffer_free(buffer); +} + +/* + * Check copy-on-write works. + */ +TEST_F(hmm, exclusive_cow) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + fork(); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i]++, i); + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i+1); + + hmm_buffer_free(buffer); +} + +static int gup_test_exec(int gup_fd, unsigned long addr, int cmd, + int npages, int size, int flags) +{ + struct gup_test gup = { + .nr_pages_per_call = npages, + .addr = addr, + .gup_flags = FOLL_WRITE | flags, + .size = size, + }; + + if (ioctl(gup_fd, cmd, &gup)) { + perror("ioctl on error\n"); + return errno; + } + + return 0; +} + +/* + * Test get user device pages through gup_test. Setting PIN_LONGTERM flag. + * This should trigger a migration back to system memory for both, private + * and coherent type pages. + * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added + * to your configuration before you run it. + */ +TEST_F(hmm, hmm_gup_test) +{ + struct hmm_buffer *buffer; + int gup_fd; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + unsigned char *m; + + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + if (gup_fd == -1) + SKIP(return, "Skipping test, could not find gup_test driver"); + + npages = 4; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + /* Check what the device read. 
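
hmm_gup_test above probes for the driver at run time and skips rather than fails when it is absent; the same open-then-skip pattern suits any optional debugfs interface. A sketch (the debugfs path is the one the test itself uses; the backing config option is CONFIG_GUP_TEST):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/kernel/debug/gup_test", O_RDWR);

            if (fd < 0) {
                    /* Driver missing or debugfs not mounted: skip, not fail. */
                    fprintf(stderr, "gup_test not available, skipping\n");
                    return 0;
            }
            /* ... issue GUP_BASIC_TEST etc. via ioctl(), as above ... */
            close(fd);
            return 0;
    }
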
*/ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr, + GUP_BASIC_TEST, 1, self->page_size, 0), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 1 * self->page_size, + GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 2 * self->page_size, + PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 3 * self->page_size, + PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0); + + /* Take snapshot to CPU pagetables */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + m = buffer->mirror; + if (hmm_is_coherent_type(variant->device_number)) { + ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]); + ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]); + } else { + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]); + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]); + } + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]); + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]); + /* + * Check again the content on the pages. Make sure there's no + * corrupted data. + */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + close(gup_fd); + hmm_buffer_free(buffer); +} + +/* + * Test copy-on-write in device pages. + * In case of writing to COW private page(s), a page fault will migrate pages + * back to system memory first. Then, these pages will be duplicated. In case + * of COW device coherent type, pages are duplicated directly from device + * memory. + */ +TEST_F(hmm, hmm_cow_in_device) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + unsigned char *m; + pid_t pid; + int status; + + npages = 4; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (!pid) { + /* Child process waits for SIGTERM from the parent. */ + while (1) { + } + perror("Should not reach this\n"); + exit(0); + } + /* Parent process writes to COW page(s) and gets a + * new copy in system memory. In case of device private pages, + * this write causes a migration to system memory first.
+ */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Terminate child and wait */ + EXPECT_EQ(0, kill(pid, SIGTERM)); + EXPECT_EQ(pid, waitpid(pid, &status, 0)); + EXPECT_NE(0, WIFSIGNALED(status)); + EXPECT_EQ(SIGTERM, WTERMSIG(status)); + + /* Take snapshot to CPU pagetables */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + m = buffer->mirror; + for (i = 0; i < npages; i++) + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]); + + hmm_buffer_free(buffer); +} +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c new file mode 100644 index 000000000000..955ef87f382c --- /dev/null +++ b/tools/testing/selftests/mm/hugepage-mmap.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-mmap: + * + * Example of using huge page memory in a user application using the mmap + * system call. Before running this application, make sure that the + * administrator has mounted the hugetlbfs filesystem (on some directory + * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this + * example, the app is requesting memory of size 256MB that is backed by + * huge pages. + * + * For the ia64 architecture, the Linux kernel reserves Region number 4 for + * huge pages. That means that if one requires a fixed address, a huge page + * aligned address starting with 0x800000... will be required. If a fixed + * address is not required, the kernel will select an address in the proper + * range. + * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#define LENGTH (256UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define FLAGS (MAP_SHARED | MAP_FIXED) +#else +#define ADDR (void *)(0x0UL) +#define FLAGS (MAP_SHARED) +#endif + +static void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr) +{ + unsigned long i; + + for (i = 0; i < LENGTH; i++) + *(addr + i) = (char)i; +} + +static int read_bytes(char *addr) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < LENGTH; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + return 1; + } + return 0; +} + +int main(void) +{ + void *addr; + int fd, ret; + + fd = memfd_create("hugepage-mmap", MFD_HUGETLB); + if (fd < 0) { + perror("memfd_create() failed"); + exit(1); + } + + addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + close(fd); + exit(1); + } + + printf("Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr); + ret = read_bytes(addr); + + munmap(addr, LENGTH); + close(fd); + + return ret; +} diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c new file mode 100644 index 000000000000..e53b5eaa8fce --- /dev/null +++ b/tools/testing/selftests/mm/hugepage-mremap.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-mremap: + * + * Example of remapping huge page memory in a user application using the + * mremap system call. The path to a file in a hugetlbfs filesystem must + * be passed as the last argument to this test. The amount of memory used + * by this test in MBs can optionally be passed as an argument. 
If no memory + * amount is passed, the default amount is 10MB. + * + * To make sure the test triggers pmd sharing and goes through the 'unshare' + * path in the mremap code use 1GB (1024) or more. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include /* Definition of O_* constants */ +#include /* Definition of SYS_* constants */ +#include +#include +#include + +#define DEFAULT_LENGTH_MB 10UL +#define MB_TO_BYTES(x) (x * 1024 * 1024) + +#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) +#define FLAGS (MAP_SHARED | MAP_ANONYMOUS) + +static void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr, size_t len) +{ + unsigned long i; + + for (i = 0; i < len; i++) + *(addr + i) = (char)i; +} + +static int read_bytes(char *addr, size_t len) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < len; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + return 1; + } + return 0; +} + +static void register_region_with_uffd(char *addr, size_t len) +{ + long uffd; /* userfaultfd file descriptor */ + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + + /* Create and enable userfaultfd object. */ + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd == -1) { + perror("userfaultfd"); + exit(1); + } + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { + perror("ioctl-UFFDIO_API"); + exit(1); + } + + /* Create a private anonymous mapping. The memory will be + * demand-zero paged--that is, not yet allocated. When we + * actually touch the memory, it will be allocated via + * the userfaultfd. + */ + + addr = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + printf("Address returned by mmap() = %p\n", addr); + + /* Register the memory range of the mapping we just created for + * handling by the userfaultfd object. In mode, we request to track + * missing pages (i.e., pages that have not yet been faulted in). + */ + + uffdio_register.range.start = (unsigned long)addr; + uffdio_register.range.len = len; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { + perror("ioctl-UFFDIO_REGISTER"); + exit(1); + } +} + +int main(int argc, char *argv[]) +{ + size_t length = 0; + int ret = 0, fd; + + if (argc >= 2 && !strcmp(argv[1], "-h")) { + printf("Usage: %s [length_in_MB]\n", argv[0]); + exit(1); + } + + /* Read memory length as the first arg if valid, otherwise fallback to + * the default length. + */ + if (argc >= 2) + length = (size_t)atoi(argv[1]); + else + length = DEFAULT_LENGTH_MB; + + length = MB_TO_BYTES(length); + fd = memfd_create(argv[0], MFD_HUGETLB); + if (fd < 0) { + perror("Open failed"); + exit(1); + } + + /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */ + unsigned long suggested_addr = 0x7eaa40000000; + void *haddr = mmap((void *)suggested_addr, length, PROTECTION, + MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + printf("Map haddr: Returned address is %p\n", haddr); + if (haddr == MAP_FAILED) { + perror("mmap1"); + exit(1); + } + + /* mmap again to a dummy address to hopefully trigger pmd sharing. 
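
register_region_with_uffd() above registers a range for MISSING-mode tracking, but this test never runs a thread to service faults. For reference, the consumer half of a complete userfaultfd setup would look roughly like this sketch: poll the descriptor, then read one uffd_msg:

    #include <linux/userfaultfd.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    static void drain_one_uffd_event(int uffd)
    {
            struct pollfd pfd = { .fd = uffd, .events = POLLIN };
            struct uffd_msg msg;

            if (poll(&pfd, 1, -1) > 0 &&
                read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
                msg.event == UFFD_EVENT_PAGEFAULT)
                    printf("fault at %llx\n",
                           (unsigned long long)msg.arg.pagefault.address);
    }
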
*/ + suggested_addr = 0x7daa40000000; + void *daddr = mmap((void *)suggested_addr, length, PROTECTION, + MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + printf("Map daddr: Returned address is %p\n", daddr); + if (daddr == MAP_FAILED) { + perror("mmap3"); + exit(1); + } + + suggested_addr = 0x7faa40000000; + void *vaddr = + mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0); + printf("Map vaddr: Returned address is %p\n", vaddr); + if (vaddr == MAP_FAILED) { + perror("mmap2"); + exit(1); + } + + register_region_with_uffd(haddr, length); + + void *addr = mremap(haddr, length, length, + MREMAP_MAYMOVE | MREMAP_FIXED, vaddr); + if (addr == MAP_FAILED) { + perror("mremap"); + exit(1); + } + + printf("Mremap: Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr, length); + ret = read_bytes(addr, length); + + munmap(addr, length); + + addr = mremap(addr, length, length, 0); + if (addr != MAP_FAILED) { + printf("mremap: Expected failure, but call succeeded\n"); + exit(1); + } + + close(fd); + + return ret; +} diff --git a/tools/testing/selftests/mm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c new file mode 100644 index 000000000000..e2527f32005b --- /dev/null +++ b/tools/testing/selftests/mm/hugepage-shm.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-shm: + * + * Example of using huge page memory in a user application using Sys V shared + * memory system calls. In this example the app is requesting 256MB of + * memory that is backed by huge pages. The application uses the flag + * SHM_HUGETLB in the shmget system call to inform the kernel that it is + * requesting huge pages. + * + * For the ia64 architecture, the Linux kernel reserves Region number 4 for + * huge pages. That means that if one requires a fixed address, a huge page + * aligned address starting with 0x800000... will be required. If a fixed + * address is not required, the kernel will select an address in the proper + * range. + * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. + * + * Note: The default shared memory limit is quite low on many kernels, + * you may need to increase it via: + * + * echo 268435456 > /proc/sys/kernel/shmmax + * + * This will increase the maximum size per shared memory segment to 256MB. + * The other limit that you will hit eventually is shmall which is the + * total amount of shared memory in pages. 
To set it to 16GB on a system + * with a 4kB pagesize do: + * + * echo 4194304 > /proc/sys/kernel/shmall + */ + +#include +#include +#include +#include +#include +#include + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + +#define LENGTH (256UL*1024*1024) + +#define dprintf(x) printf(x) + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define SHMAT_FLAGS (SHM_RND) +#else +#define ADDR (void *)(0x0UL) +#define SHMAT_FLAGS (0) +#endif + +int main(void) +{ + int shmid; + unsigned long i; + char *shmaddr; + + shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) { + perror("shmget"); + exit(1); + } + printf("shmid: 0x%x\n", shmid); + + shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); + if (shmaddr == (char *)-1) { + perror("Shared memory attach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(2); + } + printf("shmaddr: %p\n", shmaddr); + + dprintf("Starting the writes:\n"); + for (i = 0; i < LENGTH; i++) { + shmaddr[i] = (char)(i); + if (!(i % (1024 * 1024))) + dprintf("."); + } + dprintf("\n"); + + dprintf("Starting the Check..."); + for (i = 0; i < LENGTH; i++) + if (shmaddr[i] != (char)i) { + printf("\nIndex %lu mismatched\n", i); + exit(3); + } + dprintf("Done.\n"); + + if (shmdt((const void *)shmaddr) != 0) { + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(4); + } + + shmctl(shmid, IPC_RMID, NULL); + + return 0; +} diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c new file mode 100644 index 000000000000..557bdbd4f87e --- /dev/null +++ b/tools/testing/selftests/mm/hugepage-vmemmap.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case of using hugepage memory in a user application using the + * mmap system call with MAP_HUGETLB flag. Before running this program + * make sure the administrator has allocated enough default sized huge + * pages to cover the 2 MB allocation. + */ +#include +#include +#include +#include +#include + +#define MAP_LENGTH (2UL * 1024 * 1024) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#define PAGE_SIZE 4096 + +#define PAGE_COMPOUND_HEAD (1UL << 15) +#define PAGE_COMPOUND_TAIL (1UL << 16) +#define PAGE_HUGE (1UL << 17) + +#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) +#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) + +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) + +/* + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. 
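
hugepage-vmemmap.c (next) translates a virtual address to a page frame number via /proc/self/pagemap: one little-endian u64 per virtual page, the PFN in bits 0-54, and bit 63 set when the page is present. Reading PFNs requires CAP_SYS_ADMIN on modern kernels. A standalone sketch of the virt_to_pfn() arithmetic with explicit error handling:

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    static uint64_t pfn_of(const void *vaddr)
    {
            long psz = sysconf(_SC_PAGE_SIZE);
            off_t off = ((uintptr_t)vaddr / psz) * sizeof(uint64_t);
            uint64_t entry = 0;
            int fd = open("/proc/self/pagemap", O_RDONLY);

            if (fd >= 0) {
                    if (pread(fd, &entry, sizeof(entry), off) != sizeof(entry))
                            entry = 0;
                    close(fd);
            }
            if (!(entry >> 63))                     /* bit 63: page present */
                    return 0;
            return entry & ((1ULL << 55) - 1);      /* bits 0-54: the PFN */
    }
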
+ */ +#ifdef __ia64__ +#define MAP_ADDR (void *)(0x8000000000000000UL) +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define MAP_ADDR NULL +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static unsigned long virt_to_pfn(void *addr) +{ + int fd; + unsigned long pagemap; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + return -1UL; + + lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); + read(fd, &pagemap, sizeof(pagemap)); + close(fd); + + return pagemap & ~PM_PFRAME_MASK; +} + +static int check_page_flags(unsigned long pfn) +{ + int fd, i; + unsigned long pageflags; + + fd = open("/proc/kpageflags", O_RDONLY); + if (fd < 0) + return -1; + + lseek(fd, pfn * sizeof(pageflags), SEEK_SET); + + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { + close(fd); + printf("Head page flags (%lx) is invalid\n", pageflags); + return -1; + } + + /* + * pages other than the first page must be tail and shouldn't be head; + * this also verifies kernel has correctly set the fake page_head to tail + * while hugetlb_free_vmemmap is enabled. + */ + for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || + (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { + close(fd); + printf("Tail page flags (%lx) is invalid\n", pageflags); + return -1; + } + } + + close(fd); + + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + unsigned long pfn; + + addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* Trigger allocation of HugeTLB page. */ + write_bytes(addr, MAP_LENGTH); + + pfn = virt_to_pfn(addr); + if (pfn == -1UL) { + munmap(addr, MAP_LENGTH); + perror("virt_to_pfn"); + exit(1); + } + + printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + + if (check_page_flags(pfn) < 0) { + munmap(addr, MAP_LENGTH); + perror("check_page_flags"); + exit(1); + } + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, MAP_LENGTH)) { + perror("munmap"); + exit(1); + } + + return 0; +} diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c new file mode 100644 index 000000000000..a634f47d1e56 --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-madvise: + * + * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE + * on hugetlb mappings. + * + * Before running this test, make sure the administrator has pre-allocated + * at least MIN_FREE_PAGES hugetlb pages and they are free. In addition, + * the test takes an argument that is the path to a file in a hugetlbfs + * filesystem. Therefore, a hugetlbfs filesystem must be mounted on some + * directory. 
+ */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#define __USE_GNU +#include + +#define MIN_FREE_PAGES 20 +#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ + +#define validate_free_pages(exp_free) \ + do { \ + int fhp = get_free_hugepages(); \ + if (fhp != (exp_free)) { \ + printf("Unexpected number of free huge " \ + "pages line %d\n", __LINE__); \ + exit(1); \ + } \ + } while (0) + +unsigned long huge_page_size; +unsigned long base_page_size; + +/* + * default_huge_page_size copied from mlock2-tests.c + */ +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +unsigned long get_free_hugepages(void) +{ + unsigned long fhp = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return fhp; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1) + break; + } + + free(line); + fclose(f); + return fhp; +} + +void write_fault_pages(void *addr, unsigned long nr_pages) +{ + unsigned long i; + + for (i = 0; i < nr_pages; i++) + *((unsigned long *)(addr + (i * huge_page_size))) = i; +} + +void read_fault_pages(void *addr, unsigned long nr_pages) +{ + unsigned long dummy = 0; + unsigned long i; + + for (i = 0; i < nr_pages; i++) + dummy += *((unsigned long *)(addr + (i * huge_page_size))); +} + +int main(int argc, char **argv) +{ + unsigned long free_hugepages; + void *addr, *addr2; + int fd; + int ret; + + huge_page_size = default_huge_page_size(); + if (!huge_page_size) { + printf("Unable to determine huge page size, exiting!\n"); + exit(1); + } + base_page_size = sysconf(_SC_PAGE_SIZE); + if (!huge_page_size) { + printf("Unable to determine base page size, exiting!\n"); + exit(1); + } + + free_hugepages = get_free_hugepages(); + if (free_hugepages < MIN_FREE_PAGES) { + printf("Not enough free huge pages to test, exiting!\n"); + exit(1); + } + + fd = memfd_create(argv[0], MFD_HUGETLB); + if (fd < 0) { + perror("memfd_create() failed"); + exit(1); + } + + /* + * Test validity of MADV_DONTNEED addr and length arguments. mmap + * size is NR_HUGE_PAGES + 2. One page at the beginning and end of + * the mapping will be unmapped so we KNOW there is nothing mapped + * there. 
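
The first mapping set up below uses a guard-page trick: map NR_HUGE_PAGES + 2 pages, then unmap the first and last, so the invalid-address madvise() probes are guaranteed to hit unmapped memory on either side. The same trick with ordinary base pages:

    #include <assert.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long psz = sysconf(_SC_PAGE_SIZE);
            char *raw = mmap(NULL, 6 * psz, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            char *mid;

            assert(raw != MAP_FAILED);
            assert(munmap(raw, psz) == 0);                  /* guard below */
            assert(munmap(raw + 5 * psz, psz) == 0);        /* guard above */
            mid = raw + psz;        /* 4 pages, unmapped on both sides */

            /* madvise() just below the mapping must fail (ENOMEM). */
            assert(madvise(mid - psz, psz, MADV_DONTNEED) == -1);
            return 0;
    }
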
+ */ + addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + if (munmap(addr, huge_page_size) || + munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, + huge_page_size)) { + perror("munmap"); + exit(1); + } + addr = addr + huge_page_size; + + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* addr before mapping should fail */ + ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with invalid addr line %d\n", + __LINE__); + exit(1); + } + + /* addr + length after mapping should fail */ + ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with invalid length line %d\n", + __LINE__); + exit(1); + } + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test alignment of MADV_DONTNEED addr and length arguments + */ + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* addr is not huge page size aligned and should fail */ + ret = madvise(addr + base_page_size, + NR_HUGE_PAGES * huge_page_size - base_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with unaligned start address %d\n", + __LINE__); + exit(1); + } + + /* addr + length should be aligned down to huge page size */ + if (madvise(addr, + ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, + MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all but last page in mapping */ + validate_free_pages(free_hugepages - 1); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + validate_free_pages(free_hugepages); + + /* + * Test MADV_DONTNEED on anonymous private mapping + */ + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all pages in mapping */ + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* read should not consume any pages */ + read_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages 
- (2 * NR_HUGE_PAGES)); + + /* madvise should free private pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* + * The fallocate below certainly should free the pages associated + * with the file. However, pages in the private mapping are also + * freed. This is not the 'correct' behavior, but is expected + * because this is how it has worked since the initial hugetlb + * implementation. + */ + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on shared mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* write should not consume any pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* + * Test MADV_REMOVE on shared mapping of hugetlb file + * + * madvise is same as hole punch and should free all pages. + */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_REMOVE on shared and private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* shared write should not consume any additional pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr2 == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* private read should not consume any pages */ + read_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages */ + write_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of shared mapping should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of private mapping should free private pages */ + if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages again */ + write_fault_pages(addr2, 
NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* + * madvise should free both file and private pages although this is + * not correct. private pages should not be freed, but this is + * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call. + */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); + + close(fd); + return 0; +} diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh new file mode 100644 index 000000000000..bf2d2a684edf --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -0,0 +1,252 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +set -e + +if [[ $(id -u) -ne 0 ]]; then + echo "This test must be run as root. Skipping..." + exit $ksft_skip +fi + +usage_file=usage_in_bytes + +if [[ "$1" == "-cgroup-v2" ]]; then + cgroup2=1 + usage_file=current +fi + + +if [[ $cgroup2 ]]; then + CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory + mount -t cgroup2 none $CGROUP_ROOT + do_umount=1 + fi + echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control +else + CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory + mount -t cgroup memory,hugetlb $CGROUP_ROOT + do_umount=1 + fi +fi +MNT='/mnt/huge/' + +function get_machine_hugepage_size() { + hpz=$(grep -i hugepagesize /proc/meminfo) + kb=${hpz:14:-3} + mb=$(($kb / 1024)) + echo $mb +} + +MB=$(get_machine_hugepage_size) + +function cleanup() { + echo cleanup + set +e + rm -rf "$MNT"/* 2>/dev/null + umount "$MNT" 2>/dev/null + rmdir "$MNT" 2>/dev/null + rmdir "$CGROUP_ROOT"/a/b 2>/dev/null + rmdir "$CGROUP_ROOT"/a 2>/dev/null + rmdir "$CGROUP_ROOT"/test1 2>/dev/null + echo 0 >/proc/sys/vm/nr_hugepages + set -e +} + +function assert_state() { + local expected_a="$1" + local expected_a_hugetlb="$2" + local expected_b="" + local expected_b_hugetlb="" + + if [ ! -z ${3:-} ] && [ ! 
-z ${4:-} ]; then + expected_b="$3" + expected_b_hugetlb="$4" + fi + local tolerance=$((5 * 1024 * 1024)) + + local actual_a + actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)" + if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] || + [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then + echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB + echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + local actual_a_hugetlb + actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)" + if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] || + [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then + echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB + echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then + return + fi + + local actual_b + actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)" + if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] || + [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then + echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB + echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + local actual_b_hugetlb + actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)" + if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] || + [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then + echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB + echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi +} + +function setup() { + echo 100 >/proc/sys/vm/nr_hugepages + mkdir "$CGROUP_ROOT"/a + sleep 1 + if [[ $cgroup2 ]]; then + echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control + else + echo 0 >$CGROUP_ROOT/a/cpuset.mems + echo 0 >$CGROUP_ROOT/a/cpuset.cpus + fi + + mkdir "$CGROUP_ROOT"/a/b + + if [[ ! $cgroup2 ]]; then + echo 0 >$CGROUP_ROOT/a/b/cpuset.mems + echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus + fi + + mkdir -p "$MNT" + mount -t hugetlbfs none "$MNT" +} + +write_hugetlbfs() { + local cgroup="$1" + local path="$2" + local size="$3" + + if [[ $cgroup2 ]]; then + echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs + else + echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems + echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus + echo $$ >"$CGROUP_ROOT/$cgroup/tasks" + fi + ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o + if [[ $cgroup2 ]]; then + echo $$ >$CGROUP_ROOT/cgroup.procs + else + echo $$ >"$CGROUP_ROOT/tasks" + fi + echo +} + +set -e + +size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages. + +cleanup + +echo +echo +echo Test charge, rmdir, uncharge +setup +echo mkdir +mkdir $CGROUP_ROOT/test1 + +echo write +write_hugetlbfs test1 "$MNT"/test $size + +echo rmdir +rmdir $CGROUP_ROOT/test1 +mkdir $CGROUP_ROOT/test1 + +echo uncharge +rm -rf /mnt/huge/* + +cleanup + +echo done +echo +echo +if [[ ! $cgroup2 ]]; then + echo "Test parent and child hugetlb usage" + setup + + echo write + write_hugetlbfs a "$MNT"/test $size + + echo Assert memory charged correctly for parent use. + assert_state 0 $size 0 0 + + write_hugetlbfs a/b "$MNT"/test2 $size + + echo Assert memory charged correctly for child use. + assert_state 0 $(($size * 2)) 0 $size + + rmdir "$CGROUP_ROOT"/a/b + sleep 5 + echo Assert memory reparent correctly. 
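+  # After the rmdir, hugetlb pages charged to the deleted child "a/b" are
+  # reparented to "a", so "a" must still account for both allocations.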
+ assert_state 0 $(($size * 2)) + + rm -rf "$MNT"/* + umount "$MNT" + echo Assert memory uncharged correctly. + assert_state 0 0 + + cleanup +fi + +echo +echo +echo "Test child only hugetlb usage" +echo setup +setup + +echo write +write_hugetlbfs a/b "$MNT"/test2 $size + +echo Assert memory charged correctly for child only use. +assert_state 0 $(($size)) 0 $size + +rmdir "$CGROUP_ROOT"/a/b +echo Assert memory reparent correctly. +assert_state 0 $size + +rm -rf "$MNT"/* +umount "$MNT" +echo Assert memory uncharged correctly. +assert_state 0 0 + +cleanup + +echo ALL PASS + +umount $CGROUP_ROOT +rm -rf $CGROUP_ROOT diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c new file mode 100644 index 000000000000..64126c8cd561 --- /dev/null +++ b/tools/testing/selftests/mm/khugepaged.c @@ -0,0 +1,1558 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "linux/magic.h" + +#include "vm_util.h" + +#ifndef MADV_PAGEOUT +#define MADV_PAGEOUT 21 +#endif +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif + +#define BASE_ADDR ((void *)(1UL << 30)) +static unsigned long hpage_pmd_size; +static unsigned long page_size; +static int hpage_pmd_nr; + +#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" +#define PID_SMAPS "/proc/self/smaps" +#define TEST_FILE "collapse_test_file" + +#define MAX_LINE_LENGTH 500 + +enum vma_type { + VMA_ANON, + VMA_FILE, + VMA_SHMEM, +}; + +struct mem_ops { + void *(*setup_area)(int nr_hpages); + void (*cleanup_area)(void *p, unsigned long size); + void (*fault)(void *p, unsigned long start, unsigned long end); + bool (*check_huge)(void *addr, int nr_hpages); + const char *name; +}; + +static struct mem_ops *file_ops; +static struct mem_ops *anon_ops; +static struct mem_ops *shmem_ops; + +struct collapse_context { + void (*collapse)(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect); + bool enforce_pte_scan_limits; + const char *name; +}; + +static struct collapse_context *khugepaged_context; +static struct collapse_context *madvise_context; + +struct file_info { + const char *dir; + char path[PATH_MAX]; + enum vma_type type; + int fd; + char dev_queue_read_ahead_path[PATH_MAX]; +}; + +static struct file_info finfo; + +enum thp_enabled { + THP_ALWAYS, + THP_MADVISE, + THP_NEVER, +}; + +static const char *thp_enabled_strings[] = { + "always", + "madvise", + "never", + NULL +}; + +enum thp_defrag { + THP_DEFRAG_ALWAYS, + THP_DEFRAG_DEFER, + THP_DEFRAG_DEFER_MADVISE, + THP_DEFRAG_MADVISE, + THP_DEFRAG_NEVER, +}; + +static const char *thp_defrag_strings[] = { + "always", + "defer", + "defer+madvise", + "madvise", + "never", + NULL +}; + +enum shmem_enabled { + SHMEM_ALWAYS, + SHMEM_WITHIN_SIZE, + SHMEM_ADVISE, + SHMEM_NEVER, + SHMEM_DENY, + SHMEM_FORCE, +}; + +static const char *shmem_enabled_strings[] = { + "always", + "within_size", + "advise", + "never", + "deny", + "force", + NULL +}; + +struct khugepaged_settings { + bool defrag; + unsigned int alloc_sleep_millisecs; + unsigned int scan_sleep_millisecs; + unsigned int max_ptes_none; + unsigned int max_ptes_swap; + unsigned int max_ptes_shared; + unsigned long pages_to_scan; +}; + +struct settings { + enum thp_enabled thp_enabled; + enum thp_defrag thp_defrag; + enum shmem_enabled shmem_enabled; + bool use_zero_page; + struct 
khugepaged_settings khugepaged; + unsigned long read_ahead_kb; +}; + +static struct settings saved_settings; +static bool skip_settings_restore; + +static int exit_status; + +static void success(const char *msg) +{ + printf(" \e[32m%s\e[0m\n", msg); +} + +static void fail(const char *msg) +{ + printf(" \e[31m%s\e[0m\n", msg); + exit_status++; +} + +static void skip(const char *msg) +{ + printf(" \e[33m%s\e[0m\n", msg); +} + +static int read_file(const char *path, char *buf, size_t buflen) +{ + int fd; + ssize_t numread; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) { + printf("open(%s)\n", path); + exit(EXIT_FAILURE); + return 0; + } + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) { + printf("write(%s)\n", buf); + exit(EXIT_FAILURE); + return 0; + } + + return (unsigned int) numwritten; +} + +static int read_string(const char *name, const char *strings[]) +{ + char path[PATH_MAX]; + char buf[256]; + char *c; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!read_file(path, buf, sizeof(buf))) { + perror(path); + exit(EXIT_FAILURE); + } + + c = strchr(buf, '['); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + + c++; + memmove(buf, c, sizeof(buf) - (c - buf)); + + c = strchr(buf, ']'); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + *c = '\0'; + + ret = 0; + while (strings[ret]) { + if (!strcmp(strings[ret], buf)) + return ret; + ret++; + } + + printf("Failed to parse %s\n", name); + exit(EXIT_FAILURE); +} + +static void write_string(const char *name, const char *val) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(path, val, strlen(val) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static const unsigned long _read_num(const char *path) +{ + char buf[21]; + + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + + return strtoul(buf, NULL, 10); +} + +static const unsigned long read_num(const char *name) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return _read_num(path); +} + +static void _write_num(const char *path, unsigned long num) +{ + char buf[21]; + + sprintf(buf, "%ld", num); + if (!write_file(path, buf, strlen(buf) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static void write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + _write_num(path, num); +} + +static void write_settings(struct settings *settings) +{ + struct khugepaged_settings *khugepaged = &settings->khugepaged; + + write_string("enabled", thp_enabled_strings[settings->thp_enabled]); + 
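+	/* Every knob below lives under THP_SYSFS, except read_ahead_kb,
+	 * which is a block-device queue attribute. */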
write_string("defrag", thp_defrag_strings[settings->thp_defrag]); + write_string("shmem_enabled", + shmem_enabled_strings[settings->shmem_enabled]); + write_num("use_zero_page", settings->use_zero_page); + + write_num("khugepaged/defrag", khugepaged->defrag); + write_num("khugepaged/alloc_sleep_millisecs", + khugepaged->alloc_sleep_millisecs); + write_num("khugepaged/scan_sleep_millisecs", + khugepaged->scan_sleep_millisecs); + write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); + write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); + write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); + write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); + + if (file_ops && finfo.type == VMA_FILE) + _write_num(finfo.dev_queue_read_ahead_path, + settings->read_ahead_kb); +} + +#define MAX_SETTINGS_DEPTH 4 +static struct settings settings_stack[MAX_SETTINGS_DEPTH]; +static int settings_index; + +static struct settings *current_settings(void) +{ + if (!settings_index) { + printf("Fail: No settings set"); + exit(EXIT_FAILURE); + } + return settings_stack + settings_index - 1; +} + +static void push_settings(struct settings *settings) +{ + if (settings_index >= MAX_SETTINGS_DEPTH) { + printf("Fail: Settings stack exceeded"); + exit(EXIT_FAILURE); + } + settings_stack[settings_index++] = *settings; + write_settings(current_settings()); +} + +static void pop_settings(void) +{ + if (settings_index <= 0) { + printf("Fail: Settings stack empty"); + exit(EXIT_FAILURE); + } + --settings_index; + write_settings(current_settings()); +} + +static void restore_settings(int sig) +{ + if (skip_settings_restore) + goto out; + + printf("Restore THP and khugepaged settings..."); + write_settings(&saved_settings); + success("OK"); + if (sig) + exit(EXIT_FAILURE); +out: + exit(exit_status); +} + +static void save_settings(void) +{ + printf("Save THP and khugepaged settings..."); + saved_settings = (struct settings) { + .thp_enabled = read_string("enabled", thp_enabled_strings), + .thp_defrag = read_string("defrag", thp_defrag_strings), + .shmem_enabled = + read_string("shmem_enabled", shmem_enabled_strings), + .use_zero_page = read_num("use_zero_page"), + }; + saved_settings.khugepaged = (struct khugepaged_settings) { + .defrag = read_num("khugepaged/defrag"), + .alloc_sleep_millisecs = + read_num("khugepaged/alloc_sleep_millisecs"), + .scan_sleep_millisecs = + read_num("khugepaged/scan_sleep_millisecs"), + .max_ptes_none = read_num("khugepaged/max_ptes_none"), + .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), + .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), + .pages_to_scan = read_num("khugepaged/pages_to_scan"), + }; + if (file_ops && finfo.type == VMA_FILE) + saved_settings.read_ahead_kb = + _read_num(finfo.dev_queue_read_ahead_path); + + success("OK"); + + signal(SIGTERM, restore_settings); + signal(SIGINT, restore_settings); + signal(SIGHUP, restore_settings); + signal(SIGQUIT, restore_settings); +} + +static void get_finfo(const char *dir) +{ + struct stat path_stat; + struct statfs fs; + char buf[1 << 10]; + char path[PATH_MAX]; + char *str, *end; + + finfo.dir = dir; + stat(finfo.dir, &path_stat); + if (!S_ISDIR(path_stat.st_mode)) { + printf("%s: Not a directory (%s)\n", __func__, finfo.dir); + exit(EXIT_FAILURE); + } + if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, + finfo.dir) >= sizeof(finfo.path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (statfs(finfo.dir, &fs)) { + 
perror("statfs()"); + exit(EXIT_FAILURE); + } + finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; + if (finfo.type == VMA_SHMEM) + return; + + /* Find owning device's queue/read_ahead_kb control */ + if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + if (strstr(buf, "DEVTYPE=disk")) { + /* Found it */ + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/dev/block/%d:%d/queue/read_ahead_kb", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + if (!strstr(buf, "DEVTYPE=partition")) { + printf("%s: Unknown device type: %s\n", __func__, path); + exit(EXIT_FAILURE); + } + /* + * Partition of block device - need to find actual device. + * Using naming convention that devnameN is partition of + * device devname. + */ + str = strstr(buf, "DEVNAME="); + if (!str) { + printf("%s: Could not read: %s", __func__, path); + exit(EXIT_FAILURE); + } + str += 8; + end = str; + while (*end) { + if (isdigit(*end)) { + *end = '\0'; + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/block/%s/queue/read_ahead_kb", + str) >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + ++end; + } + printf("%s: Could not read: %s\n", __func__, path); + exit(EXIT_FAILURE); +} + +static bool check_swap(void *addr, unsigned long size) +{ + bool swap = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", + size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the Swap: in the same block and check whether it got + * the expected number of hugeepages next. + */ + if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + swap = true; +err_out: + fclose(fp); + return swap; +} + +static void *alloc_mapping(int nr) +{ + void *p; + + p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (p != BASE_ADDR) { + printf("Failed to allocate VMA at %p\n", BASE_ADDR); + exit(EXIT_FAILURE); + } + + return p; +} + +static void fill_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) + p[i * page_size / sizeof(*p)] = i + 0xdead0000; +} + +/* + * MADV_COLLAPSE is a best-effort request and may fail if an internal + * resource is temporarily unavailable, in which case it will set errno to + * EAGAIN. 
In such a case, immediately reattempt the operation one more + * time. + */ +static int madvise_collapse_retry(void *p, unsigned long size) +{ + bool retry = true; + int ret; + +retry: + ret = madvise(p, size, MADV_COLLAPSE); + if (ret && errno == EAGAIN && retry) { + retry = false; + goto retry; + } + return ret; +} + +/* + * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with + * validate_memory()'able contents. + */ +static void *alloc_hpage(struct mem_ops *ops) +{ + void *p = ops->setup_area(1); + + ops->fault(p, 0, hpage_pmd_size); + + /* + * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. + * The latter is ineligible for collapse by MADV_COLLAPSE + * while the former might cause MADV_COLLAPSE to race with + * khugepaged on low-load system (like a test machine), which + * would cause MADV_COLLAPSE to fail with EAGAIN. + */ + printf("Allocate huge page..."); + if (madvise_collapse_retry(p, hpage_pmd_size)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (!ops->check_huge(p, 1)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { + perror("madvise(MADV_HUGEPAGE)"); + exit(EXIT_FAILURE); + } + success("OK"); + return p; +} + +static void validate_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) { + if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { + printf("Page %d is corrupted: %#x\n", + i, p[i * page_size / sizeof(*p)]); + exit(EXIT_FAILURE); + } + } +} + +static void *anon_setup_area(int nr_hpages) +{ + return alloc_mapping(nr_hpages); +} + +static void anon_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); +} + +static void anon_fault(void *p, unsigned long start, unsigned long end) +{ + fill_memory(p, start, end); +} + +static bool anon_check_huge(void *addr, int nr_hpages) +{ + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); +} + +static void *file_setup_area(int nr_hpages) +{ + int fd; + void *p; + unsigned long size; + + unlink(finfo.path); /* Cleanup from previous failed tests */ + printf("Creating %s for collapse%s...", finfo.path, + finfo.type == VMA_SHMEM ? 
" (tmpfs)" : ""); + fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, + 777); + if (fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + + size = nr_hpages * hpage_pmd_size; + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + write(fd, p, size); + close(fd); + munmap(p, size); + success("OK"); + + printf("Opening %s read only for collapse...", finfo.path); + finfo.fd = open(finfo.path, O_RDONLY, 777); + if (finfo.fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, + MAP_PRIVATE, finfo.fd, 0); + if (p == MAP_FAILED || p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + + /* Drop page cache */ + write_file("/proc/sys/vm/drop_caches", "3", 2); + success("OK"); + return p; +} + +static void file_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); + unlink(finfo.path); +} + +static void file_fault(void *p, unsigned long start, unsigned long end) +{ + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { + perror("madvise(MADV_POPULATE_READ"); + exit(EXIT_FAILURE); + } +} + +static bool file_check_huge(void *addr, int nr_hpages) +{ + switch (finfo.type) { + case VMA_FILE: + return check_huge_file(addr, nr_hpages, hpage_pmd_size); + case VMA_SHMEM: + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); + default: + exit(EXIT_FAILURE); + return false; + } +} + +static void *shmem_setup_area(int nr_hpages) +{ + void *p; + unsigned long size = nr_hpages * hpage_pmd_size; + + finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); + if (finfo.fd < 0) { + perror("memfd_create()"); + exit(EXIT_FAILURE); + } + if (ftruncate(finfo.fd, size)) { + perror("ftruncate()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, + 0); + if (p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + return p; +} + +static void shmem_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); +} + +static bool shmem_check_huge(void *addr, int nr_hpages) +{ + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); +} + +static struct mem_ops __anon_ops = { + .setup_area = &anon_setup_area, + .cleanup_area = &anon_cleanup_area, + .fault = &anon_fault, + .check_huge = &anon_check_huge, + .name = "anon", +}; + +static struct mem_ops __file_ops = { + .setup_area = &file_setup_area, + .cleanup_area = &file_cleanup_area, + .fault = &file_fault, + .check_huge = &file_check_huge, + .name = "file", +}; + +static struct mem_ops __shmem_ops = { + .setup_area = &shmem_setup_area, + .cleanup_area = &shmem_cleanup_area, + .fault = &anon_fault, + .check_huge = &shmem_check_huge, + .name = "shmem", +}; + +static void __madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + int ret; + struct settings settings = *current_settings(); + + printf("%s...", msg); + + /* + * Prevent khugepaged interference and tests that MADV_COLLAPSE + * ignores /sys/kernel/mm/transparent_hugepage/enabled + */ + settings.thp_enabled = THP_NEVER; + settings.shmem_enabled = SHMEM_NEVER; + push_settings(&settings); + + /* Clear VM_NOHUGEPAGE */ + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); + if (((bool)ret) == expect) + fail("Fail: Bad return value"); + else if (!ops->check_huge(p, expect ? 
nr_hpages : 0)) + fail("Fail: check_huge()"); + else + success("OK"); + + pop_settings(); +} + +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + __madvise_collapse(msg, p, nr_hpages, ops, expect); +} + +#define TICK 500000 +static bool wait_for_scan(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops) +{ + int full_scans; + int timeout = 6; /* 3 seconds */ + + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + + /* Wait until the second full_scan completed */ + full_scans = read_num("khugepaged/full_scans") + 2; + + printf("%s...", msg); + while (timeout--) { + if (ops->check_huge(p, nr_hpages)) + break; + if (read_num("khugepaged/full_scans") >= full_scans) + break; + printf("."); + usleep(TICK); + } + + madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE); + + return timeout == -1; +} + +static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + if (wait_for_scan(msg, p, nr_hpages, ops)) { + if (expect) + fail("Timeout"); + else + success("OK"); + return; + } + + /* + * For file and shmem memory, khugepaged only retracts pte entries after + * putting the new hugepage in the page cache. The hugepage must be + * subsequently refaulted to install the pmd mapping for the mm. + */ + if (ops != &__anon_ops) + ops->fault(p, 0, nr_hpages * hpage_pmd_size); + + if (ops->check_huge(p, expect ? nr_hpages : 0)) + success("OK"); + else + fail("Fail"); +} + +static struct collapse_context __khugepaged_context = { + .collapse = &khugepaged_collapse, + .enforce_pte_scan_limits = true, + .name = "khugepaged", +}; + +static struct collapse_context __madvise_context = { + .collapse = &madvise_collapse, + .enforce_pte_scan_limits = false, + .name = "madvise", +}; + +static bool is_tmpfs(struct mem_ops *ops) +{ + return ops == &__file_ops && finfo.type == VMA_SHMEM; +} + +static void alloc_at_fault(void) +{ + struct settings settings = *current_settings(); + char *p; + + settings.thp_enabled = THP_ALWAYS; + push_settings(&settings); + + p = alloc_mapping(1); + *p = 1; + printf("Allocate huge page on fault..."); + if (check_huge_anon(p, 1, hpage_pmd_size)) + success("OK"); + else + fail("Fail"); + + pop_settings(); + + madvise(p, page_size, MADV_DONTNEED); + printf("Split huge PMD on MADV_DONTNEED..."); + if (check_huge_anon(p, 0, hpage_pmd_size)) + success("OK"); + else + fail("Fail"); + munmap(p, hpage_pmd_size); +} + +static void collapse_full(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + int nr_hpages = 4; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, + ops, true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + +static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + c->collapse("Do not collapse empty PTE table", p, 1, ops, false); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, page_size); + c->collapse("Collapse PTE table with single PTE entry present", p, + 
1, ops, true); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_none = hpage_pmd_nr / 2; + struct settings settings = *current_settings(); + void *p; + + settings.khugepaged.max_ptes_none = max_ptes_none; + push_settings(&settings); + + p = ops->setup_area(1); + + if (is_tmpfs(ops)) { + /* shmem pages always in the page cache */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, + ops, !c->enforce_pte_scan_limits); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, + true); + validate_memory(p, 0, + (hpage_pmd_nr - max_ptes_none) * page_size); + } +skip: + ops->cleanup_area(p, hpage_pmd_size); + pop_settings(); +} + +static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + + printf("Swapout one page..."); + if (madvise(p, page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, + true); + validate_memory(p, 0, hpage_pmd_size); +out: + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + + printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); + if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, (max_ptes_swap + 1) * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, + !c->enforce_pte_scan_limits); + validate_memory(p, 0, hpage_pmd_size); + + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, + hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Collapse with max_ptes_swap pages swapped out", p, + 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + } +out: + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = alloc_hpage(ops); + + if (is_tmpfs(ops)) { + /* MADV_DONTNEED won't evict tmpfs pages */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + printf("Split huge page leaving single PTE mapping compound page..."); + madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table with single PTE mapping compound page", + p, 1, ops, true); + validate_memory(p, 0, page_size); +skip: + 
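+	/* tmpfs jumps straight here: MADV_DONTNEED cannot evict its pages. */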
ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = alloc_hpage(ops); + printf("Split huge page leaving single PTE page table full of compound pages..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table full of compound pages", p, 1, ops, + true); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + int i; + + p = ops->setup_area(1); + for (i = 0; i < hpage_pmd_nr; i++) { + printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", + i + 1, hpage_pmd_nr); + + madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); + ops->fault(BASE_ADDR, 0, hpage_pmd_size); + if (!ops->check_huge(BASE_ADDR, 1)) { + printf("Failed to allocate huge page\n"); + exit(EXIT_FAILURE); + } + madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); + + p = mremap(BASE_ADDR - i * page_size, + i * page_size + hpage_pmd_size, + (i + 1) * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR + 2 * hpage_pmd_size); + if (p == MAP_FAILED) { + perror("mremap+unmap"); + exit(EXIT_FAILURE); + } + + p = mremap(BASE_ADDR + 2 * hpage_pmd_size, + (i + 1) * page_size, + (i + 1) * page_size + hpage_pmd_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR - (i + 1) * page_size); + if (p == MAP_FAILED) { + perror("mremap+alloc"); + exit(EXIT_FAILURE); + } + } + + ops->cleanup_area(BASE_ADDR, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); + if (!ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table full of different compound pages", p, 1, + ops, true); + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) +{ + int wstatus; + void *p; + + p = ops->setup_area(1); + + printf("Allocate small page..."); + ops->fault(p, 0, page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + printf("Share small page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + ops->fault(p, page_size, 2 * page_size); + c->collapse("Collapse PTE table with single page shared with parent process", + p, 1, ops, true); + + validate_memory(p, 0, page_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has small page..."); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) +{ + int wstatus; + void *p; + + p = alloc_hpage(ops); + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page PMD in child process..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (ops->check_huge(p, 0)) + success("OK"); + 
else + fail("Fail"); + ops->fault(p, 0, page_size); + + write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); + c->collapse("Collapse PTE table full of compound pages in child", + p, 1, ops, true); + write_num("khugepaged/max_ptes_shared", + current_settings()->khugepaged.max_ptes_shared); + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); + int wstatus; + void *p; + + p = alloc_hpage(ops); + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Maybe collapse with max_ptes_shared exceeded", p, + 1, ops, !c->enforce_pte_scan_limits); + + if (c->enforce_pte_scan_limits) { + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * + page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse with max_ptes_shared PTEs shared", + p, 1, ops, true); + } + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void madvise_collapse_existing_thps(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + + /* c->collapse() will find a hugepage and complain - call directly. */ + __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +/* + * Test race with khugepaged where page tables have been retracted and + * pmd cleared. 
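+ * (khugepaged retracts the page tables after collapsing file/shmem memory;
+ * MADV_COLLAPSE must then install the huge PMD from the page cache.)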
+ */ +static void madvise_retracted_page_tables(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + int nr_hpages = 1; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + + /* Let khugepaged collapse and leave pmd cleared */ + if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, + ops)) { + fail("Timeout"); + return; + } + success("OK"); + c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, + true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); + fprintf(stderr, "\t\t: :\n"); + fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); + fprintf(stderr, "\t\t: [all|anon|file|shmem]\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); + fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); + fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); + fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); + exit(1); +} + +static void parse_test_type(int argc, const char **argv) +{ + char *buf; + const char *token; + + if (argc == 1) { + /* Backwards compatibility */ + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + anon_ops = &__anon_ops; + return; + } + + buf = strdup(argv[1]); + token = strsep(&buf, ":"); + + if (!strcmp(token, "all")) { + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + } else if (!strcmp(token, "khugepaged")) { + khugepaged_context = &__khugepaged_context; + } else if (!strcmp(token, "madvise")) { + madvise_context = &__madvise_context; + } else { + usage(); + } + + if (!buf) + usage(); + + if (!strcmp(buf, "all")) { + file_ops = &__file_ops; + anon_ops = &__anon_ops; + shmem_ops = &__shmem_ops; + } else if (!strcmp(buf, "anon")) { + anon_ops = &__anon_ops; + } else if (!strcmp(buf, "file")) { + file_ops = &__file_ops; + } else if (!strcmp(buf, "shmem")) { + shmem_ops = &__shmem_ops; + } else { + usage(); + } + + if (!file_ops) + return; + + if (argc != 3) + usage(); +} + +int main(int argc, const char **argv) +{ + struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_ADVISE, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, + /* + * When testing file-backed memory, the collapse path + * looks at how many pages are found in the page cache, not + * what pages are mapped. Disable read ahead optimization so + * pages don't find their way into the page cache unless + * we mem_ops->fault() them in. 
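+		 * (A read_ahead_kb value of 0 disables readahead entirely.)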
+ */ + .read_ahead_kb = 0, + }; + + parse_test_type(argc, argv); + + if (file_ops) + get_finfo(argv[2]); + + setbuf(stdout, NULL); + + page_size = getpagesize(); + hpage_pmd_size = read_pmd_pagesize(); + hpage_pmd_nr = hpage_pmd_size / page_size; + + default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; + default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; + default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; + default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; + + save_settings(); + push_settings(&default_settings); + + alloc_at_fault(); + +#define TEST(t, c, o) do { \ + if (c && o) { \ + printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ + t(c, o); \ + } \ + } while (0) + + TEST(collapse_full, khugepaged_context, anon_ops); + TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, khugepaged_context, shmem_ops); + TEST(collapse_full, madvise_context, anon_ops); + TEST(collapse_full, madvise_context, file_ops); + TEST(collapse_full, madvise_context, shmem_ops); + + TEST(collapse_empty, khugepaged_context, anon_ops); + TEST(collapse_empty, madvise_context, anon_ops); + + TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); + TEST(collapse_single_pte_entry, madvise_context, anon_ops); + TEST(collapse_single_pte_entry, madvise_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, shmem_ops); + + TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_none, khugepaged_context, file_ops); + TEST(collapse_max_ptes_none, madvise_context, anon_ops); + TEST(collapse_max_ptes_none, madvise_context, file_ops); + + TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); + + TEST(collapse_full_of_compound, khugepaged_context, anon_ops); + TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); + TEST(collapse_full_of_compound, madvise_context, anon_ops); + TEST(collapse_full_of_compound, madvise_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, shmem_ops); + + TEST(collapse_compound_extreme, khugepaged_context, anon_ops); + TEST(collapse_compound_extreme, madvise_context, anon_ops); + + TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); + TEST(collapse_swapin_single_pte, madvise_context, anon_ops); + + TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_swap, madvise_context, anon_ops); + + TEST(collapse_fork, khugepaged_context, anon_ops); + TEST(collapse_fork, madvise_context, anon_ops); + + TEST(collapse_fork_compound, khugepaged_context, anon_ops); + TEST(collapse_fork_compound, madvise_context, anon_ops); + + TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_shared, madvise_context, anon_ops); + + TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); + TEST(madvise_collapse_existing_thps, madvise_context, file_ops); + TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); + + TEST(madvise_retracted_page_tables, madvise_context, file_ops); + TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); + + 
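+	/* Write back the sysfs state saved by save_settings() and exit with
+	 * the accumulated test status.
+	 */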
restore_settings(0); +} diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c new file mode 100644 index 000000000000..d8b5b4930412 --- /dev/null +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KSM functional tests + * + * Copyright 2022, Red Hat, Inc. + * + * Author(s): David Hildenbrand + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "vm_util.h" + +#define KiB 1024u +#define MiB (1024 * KiB) + +static int ksm_fd; +static int ksm_full_scans_fd; +static int pagemap_fd; +static size_t pagesize; + +static bool range_maps_duplicates(char *addr, unsigned long size) +{ + unsigned long offs_a, offs_b, pfn_a, pfn_b; + + /* + * There is no easy way to check if there are KSM pages mapped into + * this range. We only check that the range does not map the same PFN + * twice by comparing each pair of mapped pages. + */ + for (offs_a = 0; offs_a < size; offs_a += pagesize) { + pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a); + /* Page not present or PFN not exposed by the kernel. */ + if (pfn_a == -1ul || !pfn_a) + continue; + + for (offs_b = offs_a + pagesize; offs_b < size; + offs_b += pagesize) { + pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b); + if (pfn_b == -1ul || !pfn_b) + continue; + if (pfn_a == pfn_b) + return true; + } + } + return false; +} + +static long ksm_get_full_scans(void) +{ + char buf[10]; + ssize_t ret; + + ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0); + if (ret <= 0) + return -errno; + buf[ret] = 0; + + return strtol(buf, NULL, 10); +} + +static int ksm_merge(void) +{ + long start_scans, end_scans; + + /* Wait for two full scans such that any possible merging happened. */ + start_scans = ksm_get_full_scans(); + if (start_scans < 0) + return start_scans; + if (write(ksm_fd, "1", 1) != 1) + return -errno; + do { + end_scans = ksm_get_full_scans(); + if (end_scans < 0) + return end_scans; + } while (end_scans < start_scans + 2); + + return 0; +} + +static char *mmap_and_merge_range(char val, unsigned long size) +{ + char *map; + + map = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANON, -1, 0); + if (map == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return MAP_FAILED; + } + + /* Don't use THP. Ignore if THP are not around on a kernel. */ + if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) { + ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + goto unmap; + } + + /* Make sure each page contains the same values to merge them. */ + memset(map, val, size); + if (madvise(map, size, MADV_MERGEABLE)) { + ksft_test_result_fail("MADV_MERGEABLE failed\n"); + goto unmap; + } + + /* Run KSM to trigger merging and wait. 
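Writing "1" to /sys/kernel/mm/ksm/run starts ksmd, and ksm_merge() waits for two full scans so the whole range has been considered.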
*/ + if (ksm_merge()) { + ksft_test_result_fail("Running KSM failed\n"); + goto unmap; + } + return map; +unmap: + munmap(map, size); + return MAP_FAILED; +} + +static void test_unmerge(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +static void test_unmerge_discarded(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + /* Discard half of all mapped pages so we have pte_none() entries. */ + if (madvise(map, size / 2, MADV_DONTNEED)) { + ksft_test_result_fail("MADV_DONTNEED failed\n"); + goto unmap; + } + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +#ifdef __NR_userfaultfd +static void test_unmerge_uffd_wp(void) +{ + struct uffdio_writeprotect uffd_writeprotect; + struct uffdio_register uffdio_register; + const unsigned int size = 2 * MiB; + struct uffdio_api uffdio_api; + char *map; + int uffd; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + /* See if UFFD is around. */ + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + ksft_test_result_skip("__NR_userfaultfd failed\n"); + goto unmap; + } + + /* See if UFFD-WP is around. */ + uffdio_api.api = UFFD_API; + uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { + ksft_test_result_fail("UFFDIO_API failed\n"); + goto close_uffd; + } + if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { + ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n"); + goto close_uffd; + } + + /* Register UFFD-WP, no need for an actual handler. */ + uffdio_register.range.start = (unsigned long) map; + uffdio_register.range.len = size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { + ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n"); + goto close_uffd; + } + + /* Write-protect the range using UFFD-WP. 
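MADV_UNMERGEABLE below must then break the KSM pages even though they are marked uffd-wp.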
*/ + uffd_writeprotect.range.start = (unsigned long) map; + uffd_writeprotect.range.len = size; + uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; + if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { + ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n"); + goto close_uffd; + } + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto close_uffd; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +close_uffd: + close(uffd); +unmap: + munmap(map, size); +} +#endif + +int main(int argc, char **argv) +{ + unsigned int tests = 2; + int err; + +#ifdef __NR_userfaultfd + tests++; +#endif + + ksft_print_header(); + ksft_set_plan(tests); + + pagesize = getpagesize(); + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); + ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); + if (ksm_full_scans_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); + + test_unmerge(); + test_unmerge_discarded(); +#ifdef __NR_userfaultfd + test_unmerge_uffd_wp(); +#endif + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c new file mode 100644 index 000000000000..f9eb4d67e0dd --- /dev/null +++ b/tools/testing/selftests/mm/ksm_tests.c @@ -0,0 +1,849 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include +#include "util.h" + +#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" +#define KSM_FP(s) (KSM_SYSFS_PATH s) +#define KSM_SCAN_LIMIT_SEC_DEFAULT 120 +#define KSM_PAGE_COUNT_DEFAULT 10l +#define KSM_PROT_STR_DEFAULT "rw" +#define KSM_USE_ZERO_PAGES_DEFAULT false +#define KSM_MERGE_ACROSS_NODES_DEFAULT true +#define MB (1ul << 20) + +struct ksm_sysfs { + unsigned long max_page_sharing; + unsigned long merge_across_nodes; + unsigned long pages_to_scan; + unsigned long run; + unsigned long sleep_millisecs; + unsigned long stable_node_chains_prune_millisecs; + unsigned long use_zero_pages; +}; + +enum ksm_test_name { + CHECK_KSM_MERGE, + CHECK_KSM_UNMERGE, + CHECK_KSM_ZERO_PAGE_MERGE, + CHECK_KSM_NUMA_MERGE, + KSM_MERGE_TIME, + KSM_MERGE_TIME_HUGE_PAGES, + KSM_UNMERGE_TIME, + KSM_COW_TIME +}; + +static int ksm_write_sysfs(const char *file_path, unsigned long val) +{ + FILE *f = fopen(file_path, "w"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fprintf(f, "%lu", val) < 0) { + perror("fprintf"); + fclose(f); + return 1; + } + fclose(f); + + return 0; +} + +static int ksm_read_sysfs(const char *file_path, unsigned long *val) +{ + FILE *f = fopen(file_path, "r"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fscanf(f, "%lu", val) != 1) { + perror("fscanf"); + fclose(f); + return 1; + } + fclose(f); + + return 0; +} + +static int str_to_prot(char *prot_str) +{ + int prot = 0; + + if ((strchr(prot_str, 'r')) != NULL) + prot |= PROT_READ; + if ((strchr(prot_str, 'w')) != NULL) + prot |= PROT_WRITE; + if ((strchr(prot_str, 'x')) != NULL) + prot |= PROT_EXEC; + + return prot; +} + +static 
void print_help(void) +{ + printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n" + "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n"); + + printf("Supported :\n" + " -M (page merging)\n" + " -Z (zero pages merging)\n" + " -N (merging of pages in different NUMA nodes)\n" + " -U (page unmerging)\n" + " -P evaluate merging time and speed.\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -H evaluate merging time and speed of area allocated mostly with huge pages\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -D evaluate unmerging time and speed when disabling KSM.\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -C evaluate the time required to break COW of merged pages.\n\n"); + + printf(" -a: specify the access protections of pages.\n" + " must be of the form [rwx].\n" + " Default: %s\n", KSM_PROT_STR_DEFAULT); + printf(" -p: specify the number of pages to test.\n" + " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT); + printf(" -l: limit the maximum running time (in seconds) for a test.\n" + " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); + printf(" -z: change use_zero_pages tunable\n" + " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); + printf(" -m: change merge_across_nodes tunable\n" + " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); + printf(" -s: the size of duplicated memory area (in MiB)\n"); + + exit(0); +} + +static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size) +{ + void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0); + + if (!map_ptr) { + perror("mmap"); + return NULL; + } + memset(map_ptr, data, map_size); + if (mprotect(map_ptr, map_size, prot)) { + perror("mprotect"); + munmap(map_ptr, map_size); + return NULL; + } + + return map_ptr; +} + +static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout) +{ + struct timespec cur_time; + unsigned long cur_scan, init_scan; + + if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan)) + return 1; + cur_scan = init_scan; + + while (cur_scan < init_scan + scan_count) { + if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan)) + return 1; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) { + perror("clock_gettime"); + return 1; + } + if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { + printf("Scan time limit exceeded\n"); + return 1; + } + } + + return 0; +} + +static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_MERGEABLE)) { + perror("madvise"); + return 1; + } + if (ksm_write_sysfs(KSM_FP("run"), 1)) + return 1; + + /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */ + if (ksm_do_scan(2, start_time, timeout)) + return 1; + + return 0; +} + +static int ksm_unmerge_pages(void *addr, size_t size, + struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_UNMERGEABLE)) { + perror("madvise"); + return 1; + } + return 0; +} + +static bool assert_ksm_pages_count(long dupl_page_count) +{ + unsigned long max_page_sharing, pages_sharing, pages_shared; + + if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) || + ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) || + ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing)) + return false; + + /* + * Since there must be at least 2 pages for merging and 1 page can be + * 
shared with the limited number of pages (max_page_sharing), sometimes + * there are 'leftover' pages that cannot be merged. For example, if there + * are 11 pages and max_page_sharing = 10, then only 10 pages will be + * merged and the 11th page won't be affected. As a result, when the number + * of duplicate pages is divided by max_page_sharing and the remainder is 1, + * pages_shared and pages_sharing values will be equal between dupl_page_count + * and dupl_page_count - 1. + */ + if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) { + if (pages_shared == dupl_page_count / max_page_sharing && + pages_sharing == pages_shared * (max_page_sharing - 1)) + return true; + } else { + if (pages_shared == (dupl_page_count / max_page_sharing + 1) && + pages_sharing == dupl_page_count - pages_shared) + return true; + } + + return false; +} + +static int ksm_save_def(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) || + numa_available() ? 0 : + ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) || + ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) || + ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) || + ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) || + ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + &ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int ksm_restore(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) || + numa_available() ? 0 : + ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) || + ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) || + ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* verify that the right number of pages are merged */ + if (assert_ksm_pages_count(page_count)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + int page_count = 2; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; 
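+ + /* The two identical pages are expected to be merged at this point. */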
+ + /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */ + memset(map_ptr, '-', 1); + memset(map_ptr + page_size, '+', 1); + + /* get at least 1 scan, so KSM can detect that the pages were modified */ + if (ksm_do_scan(1, start_time, timeout)) + goto err_out; + + /* check that unmerging was successful and 0 pages are currently merged */ + if (assert_ksm_pages_count(0)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout, + bool use_zero_pages, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages)) + return KSFT_FAIL; + + /* fill pages with zero and try to merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if use_zero_pages is set to 1, empty pages are merged + * with the kernel zero page instead of with each other; + * 2) if use_zero_pages is set to 0, empty pages are not treated specially + * and merged as usual. + */ + if (use_zero_pages && !assert_ksm_pages_count(0)) + goto err_out; + else if (!use_zero_pages && !assert_ksm_pages_count(page_count)) + goto err_out; + + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int get_next_mem_node(int node) +{ + + long node_size; + int mem_node = 0; + int i, max_node = numa_max_node(); + + for (i = node + 1; i <= max_node + node; i++) { + mem_node = i % (max_node + 1); + node_size = numa_node_size(mem_node, NULL); + if (node_size > 0) + break; + } + return mem_node; +} + +static int get_first_mem_node(void) +{ + return get_next_mem_node(numa_max_node()); +} + +static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes, + size_t page_size) +{ + void *numa1_map_ptr, *numa2_map_ptr; + struct timespec start_time; + int page_count = 2; + int first_node; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (numa_available() < 0) { + perror("NUMA support not enabled"); + return KSFT_SKIP; + } + if (numa_num_configured_nodes() <= 1) { + printf("At least 2 NUMA nodes must be available\n"); + return KSFT_SKIP; + } + if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes)) + return KSFT_FAIL; + + /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */ + first_node = get_first_mem_node(); + numa1_map_ptr = numa_alloc_onnode(page_size, first_node); + numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node)); + if (!numa1_map_ptr || !numa2_map_ptr) { + perror("numa_alloc_onnode"); + return KSFT_FAIL; + } + + memset(numa1_map_ptr, '*', page_size); + memset(numa2_map_ptr, '*', page_size); + + /* try to merge the pages */ + if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) || + ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout)) + goto err_out; + + /* + * verify that the 
right number of pages are merged: + * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged; + * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is + * only 1 unique page in each node and they can't be shared. + */ + if (merge_across_nodes && !assert_ksm_pages_count(page_count)) + goto err_out; + else if (!merge_across_nodes && !assert_ksm_pages_count(0)) + goto err_out; + + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("OK\n"); + return KSFT_PASS; + +err_out: + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("Not OK\n"); + return KSFT_FAIL; +} + +static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr, *map_ptr_orig; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + int pagemap_fd, n_normal_pages, n_huge_pages; + + map_size *= MB; + size_t len = map_size; + + len -= len % HPAGE_SIZE; + map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); + map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE; + + if (map_ptr_orig == MAP_FAILED) + err(2, "initial mmap"); + + if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + err(2, "open pagemap"); + + n_normal_pages = 0; + n_huge_pages = 0; + for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) { + if (allocate_transhuge(p, pagemap_fd) < 0) + n_normal_pages++; + else + n_huge_pages++; + } + printf("Number of normal pages: %d\n", n_normal_pages); + printf("Number of huge pages: %d\n", n_huge_pages); + + memset(map_ptr, '*', len); + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_FAIL; +} + +static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); 
+ + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + +static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + +static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long cow_time_ns; + + /* page_count must be less than 2*page_size */ + size_t page_count = 4000; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB); + printf("Not merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + /* Create 2000 pairs of duplicate pages */ + for (size_t i = 0; i < page_count - 1; i = i + 2) { + memset(map_ptr + page_size * i, '+', i / 2 + 1); + memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1); + } + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, page_size * page_count); + return 
KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +int main(int argc, char *argv[]) +{ + int ret, opt; + int prot = 0; + int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; + long page_count = KSM_PAGE_COUNT_DEFAULT; + size_t page_size = sysconf(_SC_PAGESIZE); + struct ksm_sysfs ksm_sysfs_old; + int test_name = CHECK_KSM_MERGE; + bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; + bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; + long size_MB = 0; + + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) { + switch (opt) { + case 'a': + prot = str_to_prot(optarg); + break; + case 'p': + page_count = atol(optarg); + if (page_count <= 0) { + printf("The number of pages must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'l': + ksm_scan_limit_sec = atoi(optarg); + if (ksm_scan_limit_sec <= 0) { + printf("Timeout value must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'h': + print_help(); + break; + case 'z': + if (strcmp(optarg, "0") == 0) + use_zero_pages = 0; + else + use_zero_pages = 1; + break; + case 'm': + if (strcmp(optarg, "0") == 0) + merge_across_nodes = 0; + else + merge_across_nodes = 1; + break; + case 's': + size_MB = atoi(optarg); + if (size_MB <= 0) { + printf("Size must be greater than 0\n"); + return KSFT_FAIL; + } + case 'M': + break; + case 'U': + test_name = CHECK_KSM_UNMERGE; + break; + case 'Z': + test_name = CHECK_KSM_ZERO_PAGE_MERGE; + break; + case 'N': + test_name = CHECK_KSM_NUMA_MERGE; + break; + case 'P': + test_name = KSM_MERGE_TIME; + break; + case 'H': + test_name = KSM_MERGE_TIME_HUGE_PAGES; + break; + case 'D': + test_name = KSM_UNMERGE_TIME; + break; + case 'C': + test_name = KSM_COW_TIME; + break; + default: + return KSFT_FAIL; + } + } + + if (prot == 0) + prot = str_to_prot(KSM_PROT_STR_DEFAULT); + + if (access(KSM_SYSFS_PATH, F_OK)) { + printf("Config KSM not enabled\n"); + return KSFT_SKIP; + } + + if (ksm_save_def(&ksm_sysfs_old)) { + printf("Cannot save default tunables\n"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("run"), 2) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) || + numa_available() ? 
0 : + ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) + return KSFT_FAIL; + + switch (test_name) { + case CHECK_KSM_MERGE: + ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, page_size); + break; + case CHECK_KSM_UNMERGE: + ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; + case CHECK_KSM_ZERO_PAGE_MERGE: + ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, use_zero_pages, page_size); + break; + case CHECK_KSM_NUMA_MERGE: + ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + merge_across_nodes, page_size); + break; + case KSM_MERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + size_MB); + break; + case KSM_MERGE_TIME_HUGE_PAGES: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; + case KSM_UNMERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; + case KSM_COW_TIME: + ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; + } + + if (ksm_restore(&ksm_sysfs_old)) { + printf("Cannot restore default tunables\n"); + return KSFT_FAIL; + } + + return ret; +} diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c new file mode 100644 index 000000000000..262eae6b58f2 --- /dev/null +++ b/tools/testing/selftests/mm/madv_populate.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests + * + * Copyright 2021, Red Hat, Inc. + * + * Author(s): David Hildenbrand + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "vm_util.h" + +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif /* MADV_POPULATE_READ */ +#ifndef MADV_POPULATE_WRITE +#define MADV_POPULATE_WRITE 23 +#endif /* MADV_POPULATE_WRITE */ + +/* + * For now, we're using 2 MiB of private anonymous memory for all tests. 
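+ * Each madvise() flavor is exercised against read-only, write-only, + * hole-punched and freshly mapped ranges.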
+ */ +#define SIZE (2 * 1024 * 1024) + +static size_t pagesize; + +static void sense_support(void) +{ + char *addr; + int ret; + + addr = mmap(0, pagesize, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (!addr) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, pagesize, MADV_POPULATE_READ); + if (ret) + ksft_exit_skip("MADV_POPULATE_READ is not available\n"); + + ret = madvise(addr, pagesize, MADV_POPULATE_WRITE); + if (ret) + ksft_exit_skip("MADV_POPULATE_WRITE is not available\n"); + + munmap(addr, pagesize); +} + +static void test_prot_read(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == EINVAL, + "MADV_POPULATE_WRITE with PROT_READ\n"); + + munmap(addr, SIZE); +} + +static void test_prot_write(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == EINVAL, + "MADV_POPULATE_READ with PROT_WRITE\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n"); + + munmap(addr, SIZE); +} + +static void test_holes(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ret = munmap(addr + pagesize, pagesize); + if (ret) + ksft_exit_fail_msg("munmap failed\n"); + + /* Hole in the middle */ + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes in the middle\n"); + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes in the middle\n"); + + /* Hole at end */ + ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes at the end\n"); + ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes at the end\n"); + + /* Hole at beginning */ + ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes at the beginning\n"); + ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes at the beginning\n"); + + munmap(addr, SIZE); +} + +static bool range_is_populated(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (!pagemap_is_populated(fd, start)) + ret = false; + close(fd); + return ret; +} + +static bool range_is_not_populated(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + 
ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (pagemap_is_populated(fd, start)) + ret = false; + close(fd); + return ret; +} + +static void test_populate_read(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ksft_test_result(range_is_not_populated(addr, SIZE), + "range initially not populated\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ\n"); + ksft_test_result(range_is_populated(addr, SIZE), + "range is populated\n"); + + munmap(addr, SIZE); +} + +static void test_populate_write(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ksft_test_result(range_is_not_populated(addr, SIZE), + "range initially not populated\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); + ksft_test_result(range_is_populated(addr, SIZE), + "range is populated\n"); + + munmap(addr, SIZE); +} + +static bool range_is_softdirty(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (!pagemap_is_softdirty(fd, start)) + ret = false; + close(fd); + return ret; +} + +static bool range_is_not_softdirty(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (pagemap_is_softdirty(fd, start)) + ret = false; + close(fd); + return ret; +} + +static void test_softdirty(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* Clear any softdirty bits. */ + clear_softdirty(); + ksft_test_result(range_is_not_softdirty(addr, SIZE), + "range is not softdirty\n"); + + /* Populating READ should set softdirty. */ + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ\n"); + ksft_test_result(range_is_not_softdirty(addr, SIZE), + "range is not softdirty\n"); + + /* Populating WRITE should set softdirty. 
*/ + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); + ksft_test_result(range_is_softdirty(addr, SIZE), + "range is softdirty\n"); + + munmap(addr, SIZE); +} + +int main(int argc, char **argv) +{ + int err; + + pagesize = getpagesize(); + + ksft_print_header(); + ksft_set_plan(21); + + sense_support(); + test_prot_read(); + test_prot_write(); + test_holes(); + test_populate_read(); + test_populate_write(); + test_softdirty(); + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c new file mode 100644 index 000000000000..eed44322d1a6 --- /dev/null +++ b/tools/testing/selftests/mm/map_fixed_noreplace.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Test that MAP_FIXED_NOREPLACE works. + * + * Copyright 2018, Jann Horn + * Copyright 2018, Michael Ellerman, IBM Corporation. + */ + +#include +#include +#include +#include +#include + +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif + +static void dump_maps(void) +{ + char cmd[32]; + + snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); + system(cmd); +} + +static unsigned long find_base_addr(unsigned long size) +{ + void *addr; + unsigned long flags; + + flags = MAP_PRIVATE | MAP_ANONYMOUS; + addr = mmap(NULL, size, PROT_NONE, flags, -1, 0); + if (addr == MAP_FAILED) { + printf("Error: couldn't map the space we need for the test\n"); + return 0; + } + + if (munmap(addr, size) != 0) { + printf("Error: couldn't map the space we need for the test\n"); + return 0; + } + return (unsigned long)addr; +} + +int main(void) +{ + unsigned long base_addr; + unsigned long flags, addr, size, page_size; + char *p; + + page_size = sysconf(_SC_PAGE_SIZE); + + //let's find a base addr that is free before we start the tests + size = 5 * page_size; + base_addr = find_base_addr(size); + if (!base_addr) { + printf("Error: couldn't map the space we need for the test\n"); + return 1; + } + + flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; + + // Check we can map all the areas we need below + errno = 0; + addr = base_addr; + size = 5 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error: couldn't map the space we need for the test\n"); + return 1; + } + + errno = 0; + if (munmap((void *)addr, 5 * page_size) != 0) { + dump_maps(); + printf("Error: munmap failed!?\n"); + return 1; + } + printf("unmap() successful\n"); + + errno = 0; + addr = base_addr + page_size; + size = 3 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error: first mmap() failed unexpectedly\n"); + return 1; + } + + /* + * Exact same mapping again: + * base | free | new + * +1 | mapped | new + * +2 | mapped | new + * +3 | mapped | new + * +4 | free | new + */ + errno = 0; + addr = base_addr; + size = 5 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:1: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Second mapping contained 
within first: + * + * base | free | + * +1 | mapped | + * +2 | mapped | new + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = base_addr + (2 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:2: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Overlap end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | new + * +4 | free | new + */ + errno = 0; + addr = base_addr + (3 * page_size); + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:3: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Overlap start of existing mapping: + * base | free | new + * +1 | mapped | new + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = base_addr; + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:4: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Adjacent to start of existing mapping: + * base | free | new + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = base_addr; + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error:5: mmap() failed when it shouldn't have\n"); + return 1; + } + + /* + * Adjacent to end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | new + */ + errno = 0; + addr = base_addr + (4 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error:6: mmap() failed when it shouldn't have\n"); + return 1; + } + + addr = base_addr; + size = 5 * page_size; + if (munmap((void *)addr, size) != 0) { + dump_maps(); + printf("Error: munmap failed!?\n"); + return 1; + } + printf("unmap() successful\n"); + + printf("OK\n"); + return 0; +} diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c new file mode 100644 index 000000000000..312889edb84a --- /dev/null +++ b/tools/testing/selftests/mm/map_hugetlb.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Example of using hugepage memory in a user application using the mmap + * system call with MAP_HUGETLB flag. Before running this program make + * sure the administrator has allocated enough default sized huge pages + * to cover the 256 MB allocation. + * + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. 
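+ * The ADDR and FLAGS macros below therefore add MAP_FIXED and the + * region-4 address only when built for ia64.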
+ */ +#include +#include +#include +#include +#include + +#define LENGTH (256UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#ifndef MAP_HUGE_SHIFT +#define MAP_HUGE_SHIFT 26 +#endif + +#ifndef MAP_HUGE_MASK +#define MAP_HUGE_MASK 0x3f +#endif + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define ADDR (void *)(0x0UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static int read_bytes(char *addr, size_t length) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < length; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + return 1; + } + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + int ret; + size_t length = LENGTH; + int flags = FLAGS; + int shift = 0; + + if (argc > 1) + length = atol(argv[1]) << 20; + if (argc > 2) { + shift = atoi(argv[2]); + if (shift) + flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } + + if (shift) + printf("%u kB hugepages\n", 1 << (shift - 10)); + else + printf("Default size hugepages\n"); + printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20); + + addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + printf("Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr, length); + ret = read_bytes(addr, length); + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, length)) { + perror("munmap"); + exit(1); + } + + return ret; +} diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c new file mode 100644 index 000000000000..6b8aeaa0bf7a --- /dev/null +++ b/tools/testing/selftests/mm/map_populate.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Dmitry Safonov, Arista Networks + * + * MAP_POPULATE | MAP_PRIVATE should COW VMA pages. 
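+ * The parent rewrites the file after the child has populated its private + * mapping; the child must keep seeing the original data.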
+ */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef MMAP_SZ +#define MMAP_SZ 4096 +#endif + +#define BUG_ON(condition, description) \ + do { \ + if (condition) { \ + fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \ + __LINE__, (description), strerror(errno)); \ + exit(1); \ + } \ + } while (0) + +static int parent_f(int sock, unsigned long *smap, int child) +{ + int status, ret; + + ret = read(sock, &status, sizeof(int)); + BUG_ON(ret <= 0, "read(sock)"); + + *smap = 0x22222BAD; + ret = msync(smap, MMAP_SZ, MS_SYNC); + BUG_ON(ret, "msync()"); + + ret = write(sock, &status, sizeof(int)); + BUG_ON(ret <= 0, "write(sock)"); + + waitpid(child, &status, 0); + BUG_ON(!WIFEXITED(status), "child in unexpected state"); + + return WEXITSTATUS(status); +} + +static int child_f(int sock, unsigned long *smap, int fd) +{ + int ret, buf = 0; + + smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_POPULATE, fd, 0); + BUG_ON(smap == MAP_FAILED, "mmap()"); + + BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file"); + + ret = write(sock, &buf, sizeof(int)); + BUG_ON(ret <= 0, "write(sock)"); + + ret = read(sock, &buf, sizeof(int)); + BUG_ON(ret <= 0, "read(sock)"); + + BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page"); + BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted"); + + return 0; +} + +int main(int argc, char **argv) +{ + int sock[2], child, ret; + FILE *ftmp; + unsigned long *smap; + + ftmp = tmpfile(); + BUG_ON(ftmp == 0, "tmpfile()"); + + ret = ftruncate(fileno(ftmp), MMAP_SZ); + BUG_ON(ret, "ftruncate()"); + + smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, + MAP_SHARED, fileno(ftmp), 0); + BUG_ON(smap == MAP_FAILED, "mmap()"); + + *smap = 0xdeadbabe; + /* Probably unnecessary, but let it be. */ + ret = msync(smap, MMAP_SZ, MS_SYNC); + BUG_ON(ret, "msync()"); + + ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock); + BUG_ON(ret, "socketpair()"); + + child = fork(); + BUG_ON(child == -1, "fork()"); + + if (child) { + ret = close(sock[0]); + BUG_ON(ret, "close()"); + + return parent_f(sock[1], smap, child); + } + + ret = close(sock[1]); + BUG_ON(ret, "close()"); + + return child_f(sock[0], smap, fileno(ftmp)); +} diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c new file mode 100644 index 000000000000..957b9e18c729 --- /dev/null +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2021 + * + * Author: Mike Rapoport + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__) +#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) +#define skip(fmt, ...) 
ksft_test_result_skip(fmt, ##__VA_ARGS__) + +#ifdef __NR_memfd_secret + +#define PATTERN 0x55 + +static const int prot = PROT_READ | PROT_WRITE; +static const int mode = MAP_SHARED; + +static unsigned long page_size; +static unsigned long mlock_limit_cur; +static unsigned long mlock_limit_max; + +static int memfd_secret(unsigned int flags) +{ + return syscall(__NR_memfd_secret, flags); +} + +static void test_file_apis(int fd) +{ + char buf[64]; + + if ((read(fd, buf, sizeof(buf)) >= 0) || + (write(fd, buf, sizeof(buf)) >= 0) || + (pread(fd, buf, sizeof(buf), 0) >= 0) || + (pwrite(fd, buf, sizeof(buf), 0) >= 0)) + fail("unexpected file IO\n"); + else + pass("file IO is blocked as expected\n"); +} + +static void test_mlock_limit(int fd) +{ + size_t len; + char *mem; + + len = mlock_limit_cur; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("unable to mmap secret memory\n"); + return; + } + munmap(mem, len); + + len = mlock_limit_max * 2; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem != MAP_FAILED) { + fail("unexpected mlock limit violation\n"); + munmap(mem, len); + return; + } + + pass("mlock limit is respected\n"); +} + +static void try_process_vm_read(int fd, int pipefd[2]) +{ + struct iovec liov, riov; + char buf[64]; + char *mem; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + fail("pipe read: %s\n", strerror(errno)); + exit(KSFT_FAIL); + } + + liov.iov_len = riov.iov_len = sizeof(buf); + liov.iov_base = buf; + riov.iov_base = mem; + + if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) { + if (errno == ENOSYS) + exit(KSFT_SKIP); + exit(KSFT_PASS); + } + + exit(KSFT_FAIL); +} + +static void try_ptrace(int fd, int pipefd[2]) +{ + pid_t ppid = getppid(); + int status; + char *mem; + long ret; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + perror("pipe read"); + exit(KSFT_FAIL); + } + + ret = ptrace(PTRACE_ATTACH, ppid, 0, 0); + if (ret) { + perror("ptrace_attach"); + exit(KSFT_FAIL); + } + + ret = waitpid(ppid, &status, WUNTRACED); + if ((ret != ppid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitpid result %ld stat %x\n", + ret, status); + exit(KSFT_FAIL); + } + + if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0)) + exit(KSFT_PASS); + + exit(KSFT_FAIL); +} + +static void check_child_status(pid_t pid, const char *name) +{ + int status; + + waitpid(pid, &status, 0); + + if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) { + skip("%s is not supported\n", name); + return; + } + + if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) || + WIFSIGNALED(status)) { + pass("%s is blocked as expected\n", name); + return; + } + + fail("%s: unexpected memory access\n", name); +} + +static void test_remote_access(int fd, const char *name, + void (*func)(int fd, int pipefd[2])) +{ + int pipefd[2]; + pid_t pid; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + pid = fork(); + if (pid < 0) { + fail("fork failed: %s\n", strerror(errno)); + return; + } + + if (pid == 0) { + func(fd, pipefd); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + return; + } + + ftruncate(fd, page_size); + memset(mem, PATTERN, page_size); + + if (write(pipefd[1], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + return; + } + + check_child_status(pid, name); +} + +static void test_process_vm_read(int fd) +{ + test_remote_access(fd, "process_vm_read", try_process_vm_read); +} + +static void 
test_ptrace(int fd) +{ + test_remote_access(fd, "ptrace", try_ptrace); +} + +static int set_cap_limits(rlim_t max) +{ + struct rlimit new; + cap_t cap = cap_init(); + + new.rlim_cur = max; + new.rlim_max = max; + if (setrlimit(RLIMIT_MEMLOCK, &new)) { + perror("setrlimit() returns error"); + return -1; + } + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) { + perror("cap_set_proc() returns error"); + return -2; + } + + return 0; +} + +static void prepare(void) +{ + struct rlimit rlim; + + page_size = sysconf(_SC_PAGE_SIZE); + if (!page_size) + ksft_exit_fail_msg("Failed to get page size %s\n", + strerror(errno)); + + if (getrlimit(RLIMIT_MEMLOCK, &rlim)) + ksft_exit_fail_msg("Unable to detect mlock limit: %s\n", + strerror(errno)); + + mlock_limit_cur = rlim.rlim_cur; + mlock_limit_max = rlim.rlim_max; + + printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n", + page_size, mlock_limit_cur, mlock_limit_max); + + if (page_size > mlock_limit_cur) + mlock_limit_cur = page_size; + if (page_size > mlock_limit_max) + mlock_limit_max = page_size; + + if (set_cap_limits(mlock_limit_max)) + ksft_exit_fail_msg("Unable to set mlock limit: %s\n", + strerror(errno)); +} + +#define NUM_TESTS 4 + +int main(int argc, char *argv[]) +{ + int fd; + + prepare(); + + ksft_print_header(); + ksft_set_plan(NUM_TESTS); + + fd = memfd_secret(0); + if (fd < 0) { + if (errno == ENOSYS) + ksft_exit_skip("memfd_secret is not supported\n"); + else + ksft_exit_fail_msg("memfd_secret failed: %s\n", + strerror(errno)); + } + + test_mlock_limit(fd); + test_file_apis(fd); + test_process_vm_read(fd); + test_ptrace(fd); + + close(fd); + + ksft_finished(); +} + +#else /* __NR_memfd_secret */ + +int main(int argc, char *argv[]) +{ + printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_memfd_secret */ diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c new file mode 100644 index 000000000000..1cec8425e3ca --- /dev/null +++ b/tools/testing/selftests/mm/migration.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The main purpose of the tests here is to exercise the migration entry code + * paths in the kernel. 
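+ * move_pages() bounces pages between two NUMA nodes while other threads or + * child processes access them and hit the migration entry wait paths.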
+ */ + +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#define TWOMEG (2<<20) +#define RUNTIME (60) + +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) + +FIXTURE(migration) +{ + pthread_t *threads; + pid_t *pids; + int nthreads; + int n1; + int n2; +}; + +FIXTURE_SETUP(migration) +{ + int n; + + ASSERT_EQ(numa_available(), 0); + self->nthreads = numa_num_task_cpus() - 1; + self->n1 = -1; + self->n2 = -1; + + for (n = 0; n < numa_max_possible_node(); n++) + if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) { + if (self->n1 == -1) { + self->n1 = n; + } else { + self->n2 = n; + break; + } + } + + self->threads = malloc(self->nthreads * sizeof(*self->threads)); + ASSERT_NE(self->threads, NULL); + self->pids = malloc(self->nthreads * sizeof(*self->pids)); + ASSERT_NE(self->pids, NULL); +}; + +FIXTURE_TEARDOWN(migration) +{ + free(self->threads); + free(self->pids); +} + +int migrate(uint64_t *ptr, int n1, int n2) +{ + int ret, tmp; + int status = 0; + struct timespec ts1, ts2; + + if (clock_gettime(CLOCK_MONOTONIC, &ts1)) + return -1; + + while (1) { + if (clock_gettime(CLOCK_MONOTONIC, &ts2)) + return -1; + + if (ts2.tv_sec - ts1.tv_sec >= RUNTIME) + return 0; + + ret = move_pages(0, 1, (void **) &ptr, &n2, &status, + MPOL_MF_MOVE_ALL); + if (ret) { + if (ret > 0) + printf("Didn't migrate %d pages\n", ret); + else + perror("Couldn't migrate pages"); + return -2; + } + + tmp = n2; + n2 = n1; + n1 = tmp; + } + + return 0; +} + +void *access_mem(void *ptr) +{ + uint64_t y = 0; + volatile uint64_t *x = ptr; + + while (1) { + pthread_testcancel(); + y += *x; + } + + return NULL; +} + +/* + * Basic migration entry testing. One thread will move pages back and forth + * between nodes whilst other threads try and access them triggering the + * migration entry wait paths in the kernel. + */ +TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME) +{ + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) + if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) + perror("Couldn't create thread"); + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(pthread_cancel(self->threads[i]), 0); +} + +/* + * Same as the previous test but with shared memory. + */ +TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) +{ + pid_t pid; + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) { + pid = fork(); + if (!pid) + access_mem(ptr); + else + self->pids[i] = pid; + } + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); +} + +/* + * Tests the pmd migration entry paths. 
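+ * The mapping is aligned to 2 MiB and marked MADV_HUGEPAGE so the accesses + * hit PMD-mapped transparent huge pages.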
+ */ +TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME) +{ + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG); + ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0); + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) + if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) + perror("Couldn't create thread"); + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(pthread_cancel(self->threads[i]), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c new file mode 100644 index 000000000000..782ea94dee2f --- /dev/null +++ b/tools/testing/selftests/mm/mlock-random-test.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * It tests mlock/mlock2() when they are invoked + * on a randomly chosen memory region. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlock2.h" + +#define CHUNK_UNIT (128 * 1024) +#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2) +#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT +#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3) + +#define TEST_LOOP 100 +#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1)) + +int set_cap_limits(rlim_t max) +{ + struct rlimit new; + cap_t cap = cap_init(); + + new.rlim_cur = max; + new.rlim_max = max; + if (setrlimit(RLIMIT_MEMLOCK, &new)) { + perror("setrlimit() returns error\n"); + return -1; + } + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) { + perror("cap_set_proc() returns error\n"); + return -2; + } + + return 0; +} + +int get_proc_locked_vm_size(void) +{ + FILE *f; + int ret = -1; + char line[1024] = {0}; + unsigned long lock_size = 0; + + f = fopen("/proc/self/status", "r"); + if (!f) { + perror("fopen"); + return -1; + } + + while (fgets(line, 1024, f)) { + if (strstr(line, "VmLck")) { + ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size); + if (ret <= 0) { + printf("sscanf() on VmLck error: %s: %d\n", + line, ret); + fclose(f); + return -1; + } + fclose(f); + return (int)(lock_size << 10); + } + } + + perror("cannot parse VmLck in /proc/self/status\n"); + fclose(f); + return -1; +} + +/* + * Get the MMUPageSize of the memory region including the input + * address from the proc file. + * + * return value: 0 on error. + * Otherwise the page size (in bytes) is returned. + */ +int get_proc_page_size(unsigned long addr) +{ + FILE *smaps; + char *line = NULL; + unsigned long mmupage_size = 0; + size_t size = 0; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + printf("Unable to parse /proc/self/smaps\n"); + return 0; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, "MMUPageSize")) { + free(line); + line = NULL; + size = 0; + continue; + } + + /* found the MMUPageSize of this section */ + if (sscanf(line, "MMUPageSize: %8lu kB", + &mmupage_size) < 1) { + printf("Unable to parse smaps entry for MMUPageSize:%s\n", + line); + break; + } + + } + free(line); + if (smaps) + fclose(smaps); + return mmupage_size << 10; +} + +/* + * Test mlock/mlock2() on a provided memory chunk. 
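+ * (mlock2_() is the selftest wrapper around the mlock2 syscall from + * mlock2.h.)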
+ * It expects the mlock/mlock2() to be successful (within rlimit) + * + * With allocated memory chunk [p, p + alloc_size), this + * test will choose start/len randomly to perform mlock/mlock2 + * [start, start + len] memory range. The range is within range + * of the allocated chunk. + * + * The memory region size alloc_size is within the rlimit. + * So we always expect a success of mlock/mlock2. + * + * VmLck is assumed to be 0 before this test. + * + * return value: 0 - success + * else: failure + */ +int test_mlock_within_limit(char *p, int alloc_size) +{ + int i; + int ret = 0; + int locked_vm_size = 0; + struct rlimit cur; + int page_size = 0; + + getrlimit(RLIMIT_MEMLOCK, &cur); + if (cur.rlim_cur < alloc_size) { + printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n", + alloc_size, (unsigned int)cur.rlim_cur); + return -1; + } + + srand(time(NULL)); + for (i = 0; i < TEST_LOOP; i++) { + /* + * - choose mlock/mlock2 randomly + * - choose lock_size randomly but lock_size < alloc_size + * - choose start_offset randomly but p+start_offset+lock_size + * < p+alloc_size + */ + int is_mlock = !!(rand() % 2); + int lock_size = rand() % alloc_size; + int start_offset = rand() % (alloc_size - lock_size); + + if (is_mlock) + ret = mlock(p + start_offset, lock_size); + else + ret = mlock2_(p + start_offset, lock_size, + MLOCK_ONFAULT); + + if (ret) { + printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n", + is_mlock ? "mlock" : "mlock2", + p, alloc_size, + p + start_offset, lock_size); + return ret; + } + } + + /* + * Check VmLck left by the tests. + */ + locked_vm_size = get_proc_locked_vm_size(); + page_size = get_proc_page_size((unsigned long)p); + if (page_size == 0) { + printf("cannot get proc MMUPageSize\n"); + return -1; + } + + if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) { + printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n", + locked_vm_size, alloc_size); + return -1; + } + + return 0; +} + + +/* + * We expect the mlock/mlock2() to be fail (outof limitation) + * + * With allocated memory chunk [p, p + alloc_size), this + * test will randomly choose start/len and perform mlock/mlock2 + * on [start, start+len] range. + * + * The memory region size alloc_size is above the rlimit. + * And the len to be locked is higher than rlimit. + * So we always expect a failure of mlock/mlock2. + * No locked page number should be increased as a side effect. + * + * return value: 0 - success + * else: failure + */ +int test_mlock_outof_limit(char *p, int alloc_size) +{ + int i; + int ret = 0; + int locked_vm_size = 0, old_locked_vm_size = 0; + struct rlimit cur; + + getrlimit(RLIMIT_MEMLOCK, &cur); + if (cur.rlim_cur >= alloc_size) { + printf("alloc_size[%d] >%u rlimit, violates test condition\n", + alloc_size, (unsigned int)cur.rlim_cur); + return -1; + } + + old_locked_vm_size = get_proc_locked_vm_size(); + srand(time(NULL)); + for (i = 0; i < TEST_LOOP; i++) { + int is_mlock = !!(rand() % 2); + int lock_size = (rand() % (alloc_size - cur.rlim_cur)) + + cur.rlim_cur; + int start_offset = rand() % (alloc_size - lock_size); + + if (is_mlock) + ret = mlock(p + start_offset, lock_size); + else + ret = mlock2_(p + start_offset, lock_size, + MLOCK_ONFAULT); + if (ret == 0) { + printf("%s() succeeds? on %p(%d) mlock%p(%d)\n", + is_mlock ? 
"mlock" : "mlock2", + p, alloc_size, + p + start_offset, lock_size); + return -1; + } + } + + locked_vm_size = get_proc_locked_vm_size(); + if (locked_vm_size != old_locked_vm_size) { + printf("tests leads to new mlocked page: old[%d], new[%d]\n", + old_locked_vm_size, + locked_vm_size); + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + char *p = NULL; + int ret = 0; + + if (set_cap_limits(MLOCK_RLIMIT_SIZE)) + return -1; + + p = malloc(MLOCK_WITHIN_LIMIT_SIZE); + if (p == NULL) { + perror("malloc() failure\n"); + return -1; + } + ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE); + if (ret) + return ret; + munlock(p, MLOCK_WITHIN_LIMIT_SIZE); + free(p); + + + p = malloc(MLOCK_OUTOF_LIMIT_SIZE); + if (p == NULL) { + perror("malloc() failure\n"); + return -1; + } + ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE); + if (ret) + return ret; + munlock(p, MLOCK_OUTOF_LIMIT_SIZE); + free(p); + + return 0; +} diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c new file mode 100644 index 000000000000..11b2301f3aa3 --- /dev/null +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -0,0 +1,520 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "mlock2.h" + +#include "../kselftest.h" + +struct vm_boundaries { + unsigned long start; + unsigned long end; +}; + +static int get_vm_area(unsigned long addr, struct vm_boundaries *area) +{ + FILE *file; + int ret = 1; + char line[1024] = {0}; + char *end_addr; + char *stop; + unsigned long start; + unsigned long end; + + if (!area) + return ret; + + file = fopen("/proc/self/maps", "r"); + if (!file) { + perror("fopen"); + return ret; + } + + memset(area, 0, sizeof(struct vm_boundaries)); + + while(fgets(line, 1024, file)) { + end_addr = strchr(line, '-'); + if (!end_addr) { + printf("cannot parse /proc/self/maps\n"); + goto out; + } + *end_addr = '\0'; + end_addr++; + stop = strchr(end_addr, ' '); + if (!stop) { + printf("cannot parse /proc/self/maps\n"); + goto out; + } + stop = '\0'; + + sscanf(line, "%lx", &start); + sscanf(end_addr, "%lx", &end); + + if (start <= addr && end > addr) { + area->start = start; + area->end = end; + ret = 0; + goto out; + } + } +out: + fclose(file); + return ret; +} + +#define VMFLAGS "VmFlags:" + +static bool is_vmflag_set(unsigned long addr, const char *vmflag) +{ + char *line = NULL; + char *flags; + size_t size = 0; + bool ret = false; + FILE *smaps; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + printf("Unable to parse /proc/self/smaps\n"); + goto out; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, VMFLAGS)) { + free(line); + line = NULL; + size = 0; + continue; + } + + flags = line + strlen(VMFLAGS); + ret = (strstr(flags, vmflag) != NULL); + goto out; + } + +out: + free(line); + fclose(smaps); + return ret; +} + +#define SIZE "Size:" +#define RSS "Rss:" +#define LOCKED "lo" + +static unsigned long get_value_for_name(unsigned long addr, const char *name) +{ + char *line = NULL; + size_t size = 0; + char *value_ptr; + FILE *smaps = NULL; + unsigned long value = -1UL; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + printf("Unable to parse /proc/self/smaps\n"); + goto out; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, name)) { + free(line); + line = NULL; + size = 0; + continue; + } + + value_ptr = line + strlen(name); + if (sscanf(value_ptr, "%lu kB", &value) < 1) { + printf("Unable 
to parse smaps entry for Size\n"); + goto out; + } + break; + } + +out: + if (smaps) + fclose(smaps); + free(line); + return value; +} + +static bool is_vma_lock_on_fault(unsigned long addr) +{ + bool locked; + unsigned long vma_size, vma_rss; + + locked = is_vmflag_set(addr, LOCKED); + if (!locked) + return false; + + vma_size = get_value_for_name(addr, SIZE); + vma_rss = get_value_for_name(addr, RSS); + + /* only one page is faulted in */ + return (vma_rss < vma_size); +} + +#define PRESENT_BIT 0x8000000000000000ULL +#define PFN_MASK 0x007FFFFFFFFFFFFFULL +#define UNEVICTABLE_BIT (1UL << 18) + +static int lock_check(unsigned long addr) +{ + bool locked; + unsigned long vma_size, vma_rss; + + locked = is_vmflag_set(addr, LOCKED); + if (!locked) + return false; + + vma_size = get_value_for_name(addr, SIZE); + vma_rss = get_value_for_name(addr, RSS); + + return (vma_rss == vma_size); +} + +static int unlock_lock_check(char *map) +{ + if (is_vmflag_set((unsigned long)map, LOCKED)) { + printf("VMA flag %s is present on page 1 after unlock\n", LOCKED); + return 1; + } + + return 0; +} + +static int test_mlock_lock() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked mmap"); + goto out; + } + + if (mlock2_(map, 2 * page_size, 0)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock2(0)"); + goto unmap; + } + + if (!lock_check((unsigned long)map)) + goto unmap; + + /* Now unlock and recheck attributes */ + if (munlock(map, 2 * page_size)) { + perror("munlock()"); + goto unmap; + } + + ret = unlock_lock_check(map); + +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int onfault_check(char *map) +{ + *map = 'a'; + if (!is_vma_lock_on_fault((unsigned long)map)) { + printf("VMA is not marked for lock on fault\n"); + return 1; + } + + return 0; +} + +static int unlock_onfault_check(char *map) +{ + unsigned long page_size = getpagesize(); + + if (is_vma_lock_on_fault((unsigned long)map) || + is_vma_lock_on_fault((unsigned long)map + page_size)) { + printf("VMA is still lock on fault after unlock\n"); + return 1; + } + + return 0; +} + +static int test_mlock_onfault() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked mmap"); + goto out; + } + + if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock2(MLOCK_ONFAULT)"); + goto unmap; + } + + if (onfault_check(map)) + goto unmap; + + /* Now unlock and recheck attributes */ + if (munlock(map, 2 * page_size)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("munlock()"); + goto unmap; + } + + ret = unlock_onfault_check(map); +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int test_lock_onfault_of_present() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked mmap"); + goto out; + } + + *map = 'a'; + + if (mlock2_(map, 2 * page_size, 
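+		   /*
+		    * The first page is already present; MLOCK_ONFAULT must
+		    * still mark the whole range as lock-on-fault.
+		    */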
MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock2(MLOCK_ONFAULT)"); + goto unmap; + } + + if (!is_vma_lock_on_fault((unsigned long)map) || + !is_vma_lock_on_fault((unsigned long)map + page_size)) { + printf("VMA with present pages is not marked lock on fault\n"); + goto unmap; + } + ret = 0; +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int test_munlockall() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + if (map == MAP_FAILED) { + perror("test_munlockall mmap"); + goto out; + } + + if (mlockall(MCL_CURRENT)) { + perror("mlockall(MCL_CURRENT)"); + goto out; + } + + if (!lock_check((unsigned long)map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + if (unlock_lock_check(map)) + goto unmap; + + munmap(map, 2 * page_size); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + if (map == MAP_FAILED) { + perror("test_munlockall second mmap"); + goto out; + } + + if (mlockall(MCL_CURRENT | MCL_ONFAULT)) { + perror("mlockall(MCL_CURRENT | MCL_ONFAULT)"); + goto unmap; + } + + if (onfault_check(map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + if (unlock_onfault_check(map)) + goto unmap; + + if (mlockall(MCL_CURRENT | MCL_FUTURE)) { + perror("mlockall(MCL_CURRENT | MCL_FUTURE)"); + goto out; + } + + if (!lock_check((unsigned long)map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + ret = unlock_lock_check(map); + +unmap: + munmap(map, 2 * page_size); +out: + munlockall(); + return ret; +} + +static int test_vma_management(bool call_mlock) +{ + int ret = 1; + void *map; + unsigned long page_size = getpagesize(); + struct vm_boundaries page1; + struct vm_boundaries page2; + struct vm_boundaries page3; + + map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("mmap()"); + return ret; + } + + if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock(ONFAULT)\n"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* + * Before we unlock a portion, we need to that all three pages are in + * the same VMA. 
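+ * (a single 3-page anonymous mmap() normally yields one VMA, since the
+ * kernel merges adjacent anonymous VMAs with identical flags)
+ *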
If they are not we abort this test (Note that this is + * not a failure) + */ + if (page1.start != page2.start || page2.start != page3.start) { + printf("VMAs are not merged to start, aborting test\n"); + ret = 0; + goto out; + } + + if (munlock(map + page_size, page_size)) { + perror("munlock()"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* All three VMAs should be different */ + if (page1.start == page2.start || page2.start == page3.start) { + printf("failed to split VMA for munlock\n"); + goto out; + } + + /* Now unlock the first and third page and check the VMAs again */ + if (munlock(map, page_size * 3)) { + perror("munlock()"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* Now all three VMAs should be the same */ + if (page1.start != page2.start || page2.start != page3.start) { + printf("failed to merge VMAs after munlock\n"); + goto out; + } + + ret = 0; +out: + munmap(map, 3 * page_size); + return ret; +} + +static int test_mlockall(int (test_function)(bool call_mlock)) +{ + int ret = 1; + + if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { + perror("mlockall"); + return ret; + } + + ret = test_function(false); + munlockall(); + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + ret += test_mlock_lock(); + ret += test_mlock_onfault(); + ret += test_munlockall(); + ret += test_lock_onfault_of_present(); + ret += test_vma_management(true); + ret += test_mlockall(test_vma_management); + return ret; +} diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h new file mode 100644 index 000000000000..2a6e76c226bc --- /dev/null +++ b/tools/testing/selftests/mm/mlock2.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include + +#ifndef MLOCK_ONFAULT +#define MLOCK_ONFAULT 1 +#endif + +#ifndef MCL_ONFAULT +#define MCL_ONFAULT (MCL_FUTURE << 1) +#endif + +static int mlock2_(void *start, size_t len, int flags) +{ +#ifdef __NR_mlock2 + return syscall(__NR_mlock2, start, len, flags); +#else + errno = ENOSYS; + return -1; +#endif +} + +static FILE *seek_to_smaps_entry(unsigned long addr) +{ + FILE *file; + char *line = NULL; + size_t size = 0; + unsigned long start, end; + char perms[5]; + unsigned long offset; + char dev[32]; + unsigned long inode; + char path[BUFSIZ]; + + file = fopen("/proc/self/smaps", "r"); + if (!file) { + perror("fopen smaps"); + _exit(1); + } + + while (getline(&line, &size, file) > 0) { + if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", + &start, &end, perms, &offset, dev, &inode, path) < 6) + goto next; + + if (start <= addr && addr < end) + goto out; + +next: + free(line); + line = NULL; + size = 0; + } + + fclose(file); + file = NULL; + +out: + free(line); + return file; +} diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c new file mode 100644 index 000000000000..6c62966ab5db --- /dev/null +++ b/tools/testing/selftests/mm/mrelease_test.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2022 Google LLC + */ +#define _GNU_SOURCE +#include +#include +#include +#include 
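+/*
+ * process_mrelease(pidfd, flags) lets the caller reap the address space
+ * of a process that is being killed without waiting for the victim to
+ * run its own exit path. A minimal sketch of the intended flow, which
+ * the tests below follow:
+ *
+ *	pidfd = syscall(__NR_pidfd_open, pid, 0);
+ *	kill(pid, SIGKILL);
+ *	syscall(__NR_process_mrelease, pidfd, 0);
+ */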
+#include +#include + +#include "util.h" + +#include "../kselftest.h" + +#ifndef __NR_pidfd_open +#define __NR_pidfd_open -1 +#endif + +#ifndef __NR_process_mrelease +#define __NR_process_mrelease -1 +#endif + +#define MB(x) (x << 20) +#define MAX_SIZE_MB 1024 + +static int alloc_noexit(unsigned long nr_pages, int pipefd) +{ + int ppid = getppid(); + int timeout = 10; /* 10sec timeout to get killed */ + unsigned long i; + char *buf; + + buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, 0, 0); + if (buf == MAP_FAILED) { + perror("mmap failed, halting the test"); + return KSFT_FAIL; + } + + for (i = 0; i < nr_pages; i++) + *((unsigned long *)(buf + (i * PAGE_SIZE))) = i; + + /* Signal the parent that the child is ready */ + if (write(pipefd, "", 1) < 0) { + perror("write"); + return KSFT_FAIL; + } + + /* Wait to be killed (when reparenting happens) */ + while (getppid() == ppid && timeout > 0) { + sleep(1); + timeout--; + } + + munmap(buf, nr_pages * PAGE_SIZE); + + return (timeout > 0) ? KSFT_PASS : KSFT_FAIL; +} + +/* The process_mrelease calls in this test are expected to fail */ +static void run_negative_tests(int pidfd) +{ + int res; + /* Test invalid flags. Expect to fail with EINVAL error code. */ + if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || + errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + perror("process_mrelease with wrong flags"); + exit(res); + } + /* + * Test reaping while process is alive with no pending SIGKILL. + * Expect to fail with EINVAL error code. + */ + if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + perror("process_mrelease on a live process"); + exit(res); + } +} + +static int child_main(int pipefd[], size_t size) +{ + int res; + + /* Allocate and fault-in memory and wait to be killed */ + close(pipefd[0]); + res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]); + close(pipefd[1]); + return res; +} + +int main(void) +{ + int pipefd[2], pidfd; + bool success, retry; + size_t size; + pid_t pid; + char byte; + int res; + + /* Test a wrong pidfd */ + if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + perror("process_mrelease with wrong pidfd"); + exit(res); + } + + /* Start the test with 1MB child memory allocation */ + size = 1; +retry: + /* + * Pipe for the child to signal when it's done allocating + * memory + */ + if (pipe(pipefd)) { + perror("pipe"); + exit(KSFT_FAIL); + } + pid = fork(); + if (pid < 0) { + perror("fork"); + close(pipefd[0]); + close(pipefd[1]); + exit(KSFT_FAIL); + } + + if (pid == 0) { + /* Child main routine */ + res = child_main(pipefd, size); + exit(res); + } + + /* + * Parent main routine: + * Wait for the child to finish allocations, then kill and reap + */ + close(pipefd[1]); + /* Block until the child is ready */ + res = read(pipefd[0], &byte, 1); + close(pipefd[0]); + if (res < 0) { + perror("read"); + if (!kill(pid, SIGKILL)) + waitpid(pid, NULL, 0); + exit(KSFT_FAIL); + } + + pidfd = syscall(__NR_pidfd_open, pid, 0); + if (pidfd < 0) { + perror("pidfd_open"); + if (!kill(pid, SIGKILL)) + waitpid(pid, NULL, 0); + exit(KSFT_FAIL); + } + + /* Run negative tests which require a live child */ + run_negative_tests(pidfd); + + if (kill(pid, SIGKILL)) { + res = (errno == ENOSYS ? 
KSFT_SKIP : KSFT_FAIL); + perror("kill"); + exit(res); + } + + success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); + if (!success) { + /* + * If we failed to reap because the child exited too soon, + * before we could call process_mrelease. Double child's memory + * which causes it to spend more time on cleanup and increases + * our chances of reaping its memory before it exits. + * Retry until we succeed or reach MAX_SIZE_MB. + */ + if (errno == ESRCH) { + retry = (size <= MAX_SIZE_MB); + } else { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + perror("process_mrelease"); + waitpid(pid, NULL, 0); + exit(res); + } + } + + /* Cleanup to prevent zombies */ + if (waitpid(pid, NULL, 0) < 0) { + perror("waitpid"); + exit(KSFT_FAIL); + } + close(pidfd); + + if (!success) { + if (retry) { + size *= 2; + goto retry; + } + printf("All process_mrelease attempts failed!\n"); + exit(KSFT_FAIL); + } + + printf("Success reaping a child with %zuMB of memory allocations\n", + size); + return KSFT_PASS; +} diff --git a/tools/testing/selftests/mm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c new file mode 100644 index 000000000000..f01dc4a85b0b --- /dev/null +++ b/tools/testing/selftests/mm/mremap_dontunmap.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Tests for mremap w/ MREMAP_DONTUNMAP. + * + * Copyright 2020, Brian Geffon + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#ifndef MREMAP_DONTUNMAP +#define MREMAP_DONTUNMAP 4 +#endif + +unsigned long page_size; +char *page_buffer; + +static void dump_maps(void) +{ + char cmd[32]; + + snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); + system(cmd); +} + +#define BUG_ON(condition, description) \ + do { \ + if (condition) { \ + fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \ + __LINE__, (description), strerror(errno)); \ + dump_maps(); \ + exit(1); \ + } \ + } while (0) + +// Try a simple operation for to "test" for kernel support this prevents +// reporting tests as failed when it's run on an older kernel. +static int kernel_support_for_mremap_dontunmap() +{ + int ret = 0; + unsigned long num_pages = 1; + void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + // This simple remap should only fail if MREMAP_DONTUNMAP isn't + // supported. + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0); + if (dest_mapping == MAP_FAILED) { + ret = errno; + } else { + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + } + + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); + return ret; +} + +// This helper will just validate that an entire mapping contains the expected +// byte. +static int check_region_contains_byte(void *addr, unsigned long size, char byte) +{ + BUG_ON(size & (page_size - 1), + "check_region_contains_byte expects page multiples"); + BUG_ON((unsigned long)addr & (page_size - 1), + "check_region_contains_byte expects page alignment"); + + memset(page_buffer, byte, page_size); + + unsigned long num_pages = size / page_size; + unsigned long i; + + // Compare each page checking that it contains our expected byte. 
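+	// page_buffer holds exactly one page of the expected byte, so the
+	// region is compared one page at a time.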
+ for (i = 0; i < num_pages; ++i) { + int ret = + memcmp(addr + (i * page_size), page_buffer, page_size); + if (ret) { + return ret; + } + } + + return 0; +} + +// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving +// the source mapping mapped. +static void mremap_dontunmap_simple() +{ + unsigned long num_pages = 5; + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + memset(source_mapping, 'a', num_pages * page_size); + + // Try to just move the whole mapping anywhere (not fixed). + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // Validate that the pages have been moved, we know they were moved if + // the dest_mapping contains a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 0) != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected. +static void mremap_dontunmap_simple_shmem() +{ + unsigned long num_pages = 5; + + int mem_fd = memfd_create("memfd", MFD_CLOEXEC); + BUG_ON(mem_fd < 0, "memfd_create"); + + BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0, + "ftruncate"); + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, mem_fd, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + BUG_ON(close(mem_fd) < 0, "close"); + + memset(source_mapping, 'a', num_pages * page_size); + + // Try to just move the whole mapping anywhere (not fixed). + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + if (dest_mapping == MAP_FAILED && errno == EINVAL) { + // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem. + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); + return; + } + + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // Validate that the pages have been moved, we know they were moved if + // the dest_mapping contains a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + + // Because the region is backed by shmem, we will actually see the same + // memory at the source location still. + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 'a') != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates MREMAP_DONTUNMAP will move page tables to a specific +// destination using MREMAP_FIXED, also while validating that the source +// remains intact. +static void mremap_dontunmap_simple_fixed() +{ + unsigned long num_pages = 5; + + // Since we want to guarantee that we can remap to a point, we will + // create a mapping up front. 
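+	// MREMAP_FIXED requires MREMAP_MAYMOVE and implicitly unmaps anything
+	// previously mapped at the destination, which is why the Xs written
+	// below are expected to be replaced.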
+ void *dest_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(dest_mapping == MAP_FAILED, "mmap"); + memset(dest_mapping, 'X', num_pages * page_size); + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + memset(source_mapping, 'a', num_pages * page_size); + + void *remapped_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE, + dest_mapping); + BUG_ON(remapped_mapping == MAP_FAILED, "mremap"); + BUG_ON(remapped_mapping != dest_mapping, + "mremap should have placed the remapped mapping at dest_mapping"); + + // The dest mapping will have been unmap by mremap so we expect the Xs + // to be gone and replaced with a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + + // And the source mapping will have had its ptes dropped. + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 0) != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates that we can MREMAP_DONTUNMAP for a portion of an +// existing mapping. +static void mremap_dontunmap_partial_mapping() +{ + /* + * source mapping: + * -------------- + * | aaaaaaaaaa | + * -------------- + * to become: + * -------------- + * | aaaaa00000 | + * -------------- + * With the destination mapping containing 5 pages of As. + * --------- + * | aaaaa | + * --------- + */ + unsigned long num_pages = 10; + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + memset(source_mapping, 'a', num_pages * page_size); + + // We will grab the last 5 pages of the source and move them. + void *dest_mapping = + mremap(source_mapping + (5 * page_size), 5 * page_size, + 5 * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // We expect the first 5 pages of the source to contain a's and the + // final 5 pages to contain zeros. + BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') != + 0, "first 5 pages of source should have original pages"); + BUG_ON(check_region_contains_byte + (source_mapping + (5 * page_size), 5 * page_size, 0) != 0, + "final 5 pages of source should have no ptes"); + + // Finally we expect the destination to have 5 pages worth of a's. + BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != + 0, "dest mapping should contain ptes from the source"); + + BUG_ON(munmap(dest_mapping, 5 * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates that we can remap over only a portion of a mapping. +static void mremap_dontunmap_partial_mapping_overwrite(void) +{ + /* + * source mapping: + * --------- + * |aaaaa| + * --------- + * dest mapping initially: + * ----------- + * |XXXXXXXXXX| + * ------------ + * Source to become: + * --------- + * |00000| + * --------- + * With the destination mapping containing 5 pages of As. 
+ * ------------ + * |aaaaaXXXXX| + * ------------ + */ + void *source_mapping = + mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + memset(source_mapping, 'a', 5 * page_size); + + void *dest_mapping = + mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(dest_mapping == MAP_FAILED, "mmap"); + memset(dest_mapping, 'X', 10 * page_size); + + // We will grab the last 5 pages of the source and move them. + void *remapped_mapping = + mremap(source_mapping, 5 * page_size, + 5 * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping); + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping"); + + BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) != + 0, "first 5 pages of source should have no ptes"); + + // Finally we expect the destination to have 5 pages worth of a's. + BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0, + "dest mapping should contain ptes from the source"); + + // Finally the last 5 pages shouldn't have been touched. + BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size), + 5 * page_size, 'X') != 0, + "dest mapping should have retained the last 5 pages"); + + BUG_ON(munmap(dest_mapping, 10 * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, 5 * page_size) == -1, + "unable to unmap source mapping"); +} + +int main(void) +{ + page_size = sysconf(_SC_PAGE_SIZE); + + // test for kernel support for MREMAP_DONTUNMAP skipping the test if + // not. + if (kernel_support_for_mremap_dontunmap() != 0) { + printf("No kernel support for MREMAP_DONTUNMAP\n"); + return KSFT_SKIP; + } + + // Keep a page sized buffer around for when we need it. + page_buffer = + mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page."); + + mremap_dontunmap_simple(); + mremap_dontunmap_simple_shmem(); + mremap_dontunmap_simple_fixed(); + mremap_dontunmap_partial_mapping(); + mremap_dontunmap_partial_mapping_overwrite(); + + BUG_ON(munmap(page_buffer, page_size) == -1, + "unable to unmap page buffer"); + + printf("OK\n"); + return 0; +} diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c new file mode 100644 index 000000000000..9496346973d4 --- /dev/null +++ b/tools/testing/selftests/mm/mremap_test.c @@ -0,0 +1,475 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Google LLC + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#define EXPECT_SUCCESS 0 +#define EXPECT_FAILURE 1 +#define NON_OVERLAPPING 0 +#define OVERLAPPING 1 +#define NS_PER_SEC 1000000000ULL +#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */ +#define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ + +#define MIN(X, Y) ((X) < (Y) ? 
(X) : (Y)) + +struct config { + unsigned long long src_alignment; + unsigned long long dest_alignment; + unsigned long long region_size; + int overlapping; +}; + +struct test { + const char *name; + struct config config; + int expect_failure; +}; + +enum { + _1KB = 1ULL << 10, /* 1KB -> not page aligned */ + _4KB = 4ULL << 10, + _8KB = 8ULL << 10, + _1MB = 1ULL << 20, + _2MB = 2ULL << 20, + _4MB = 4ULL << 20, + _1GB = 1ULL << 30, + _2GB = 2ULL << 30, + PMD = _2MB, + PUD = _1GB, +}; + +#define PTE page_size + +#define MAKE_TEST(source_align, destination_align, size, \ + overlaps, should_fail, test_name) \ +(struct test){ \ + .name = test_name, \ + .config = { \ + .src_alignment = source_align, \ + .dest_alignment = destination_align, \ + .region_size = size, \ + .overlapping = overlaps, \ + }, \ + .expect_failure = should_fail \ +} + +/* + * Returns false if the requested remap region overlaps with an + * existing mapping (e.g text, stack) else returns true. + */ +static bool is_remap_region_valid(void *addr, unsigned long long size) +{ + void *remap_addr = NULL; + bool ret = true; + + /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */ + remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + + if (remap_addr == MAP_FAILED) { + if (errno == EEXIST) + ret = false; + } else { + munmap(remap_addr, size); + } + + return ret; +} + +/* Returns mmap_min_addr sysctl tunable from procfs */ +static unsigned long long get_mmap_min_addr(void) +{ + FILE *fp; + int n_matched; + static unsigned long long addr; + + if (addr) + return addr; + + fp = fopen("/proc/sys/vm/mmap_min_addr", "r"); + if (fp == NULL) { + ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n", + strerror(errno)); + exit(KSFT_SKIP); + } + + n_matched = fscanf(fp, "%llu", &addr); + if (n_matched != 1) { + ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n", + strerror(errno)); + fclose(fp); + exit(KSFT_SKIP); + } + + fclose(fp); + return addr; +} + +/* + * This test validates that merge is called when expanding a mapping. + * Mapping containing three pages is created, middle page is unmapped + * and then the mapping containing the first page is expanded so that + * it fills the created hole. The two parts should merge creating + * single mapping with three pages. + */ +static void mremap_expand_merge(unsigned long page_size) +{ + char *test_name = "mremap expand merge"; + FILE *fp; + char *line = NULL; + size_t len = 0; + bool success = false; + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + munmap(start + page_size, page_size); + mremap(start, page_size, 2 * page_size, 0); + + fp = fopen("/proc/self/maps", "r"); + if (fp == NULL) { + ksft_test_result_fail("%s\n", test_name); + return; + } + + while (getline(&line, &len, fp) != -1) { + char *first = strtok(line, "- "); + void *first_val = (void *)strtol(first, NULL, 16); + char *second = strtok(NULL, "- "); + void *second_val = (void *) strtol(second, NULL, 16); + + if (first_val == start && second_val == start + 3 * page_size) { + success = true; + break; + } + } + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); + fclose(fp); +} + +/* + * Returns the start address of the mapping on success, else returns + * NULL on failure. 
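+ * Candidate addresses are probed in src_alignment steps using
+ * MAP_FIXED_NOREPLACE, which fails with EEXIST instead of silently
+ * replacing an existing mapping.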
+ */ +static void *get_source_mapping(struct config c) +{ + unsigned long long addr = 0ULL; + void *src_addr = NULL; + unsigned long long mmap_min_addr; + + mmap_min_addr = get_mmap_min_addr(); + +retry: + addr += c.src_alignment; + if (addr < mmap_min_addr) + goto retry; + + src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + if (src_addr == MAP_FAILED) { + if (errno == EPERM || errno == EEXIST) + goto retry; + goto error; + } + /* + * Check that the address is aligned to the specified alignment. + * Addresses which have alignments that are multiples of that + * specified are not considered valid. For instance, 1GB address is + * 2MB-aligned, however it will not be considered valid for a + * requested alignment of 2MB. This is done to reduce coincidental + * alignment in the tests. + */ + if (((unsigned long long) src_addr & (c.src_alignment - 1)) || + !((unsigned long long) src_addr & c.src_alignment)) { + munmap(src_addr, c.region_size); + goto retry; + } + + if (!src_addr) + goto error; + + return src_addr; +error: + ksft_print_msg("Failed to map source region: %s\n", + strerror(errno)); + return NULL; +} + +/* Returns the time taken for the remap on success else returns -1. */ +static long long remap_region(struct config c, unsigned int threshold_mb, + char pattern_seed) +{ + void *addr, *src_addr, *dest_addr; + unsigned long long i; + struct timespec t_start = {0, 0}, t_end = {0, 0}; + long long start_ns, end_ns, align_mask, ret, offset; + unsigned long long threshold; + + if (threshold_mb == VALIDATION_NO_THRESHOLD) + threshold = c.region_size; + else + threshold = MIN(threshold_mb * _1MB, c.region_size); + + src_addr = get_source_mapping(c); + if (!src_addr) { + ret = -1; + goto out; + } + + /* Set byte pattern */ + srand(pattern_seed); + for (i = 0; i < threshold; i++) + memset((char *) src_addr + i, (char) rand(), 1); + + /* Mask to zero out lower bits of address for alignment */ + align_mask = ~(c.dest_alignment - 1); + /* Offset of destination address from the end of the source region */ + offset = (c.overlapping) ? 
-c.dest_alignment : c.dest_alignment; + addr = (void *) (((unsigned long long) src_addr + c.region_size + + offset) & align_mask); + + /* See comment in get_source_mapping() */ + if (!((unsigned long long) addr & c.dest_alignment)) + addr = (void *) ((unsigned long long) addr | c.dest_alignment); + + /* Don't destroy existing mappings unless expected to overlap */ + while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) { + /* Check for unsigned overflow */ + if (addr + c.dest_alignment < addr) { + ksft_print_msg("Couldn't find a valid region to remap to\n"); + ret = -1; + goto out; + } + addr += c.dest_alignment; + } + + clock_gettime(CLOCK_MONOTONIC, &t_start); + dest_addr = mremap(src_addr, c.region_size, c.region_size, + MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr); + clock_gettime(CLOCK_MONOTONIC, &t_end); + + if (dest_addr == MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + ret = -1; + goto clean_up_src; + } + + /* Verify byte pattern after remapping */ + srand(pattern_seed); + for (i = 0; i < threshold; i++) { + char c = (char) rand(); + + if (((char *) dest_addr)[i] != c) { + ksft_print_msg("Data after remap doesn't match at offset %d\n", + i); + ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + ((char *) dest_addr)[i] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + + start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; + end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; + ret = end_ns - start_ns; + +/* + * Since the destination address is specified using MREMAP_FIXED, subsequent + * mremap will unmap any previous mapping at the address range specified by + * dest_addr and region_size. This significantly affects the remap time of + * subsequent tests. So we clean up mappings after each test. + */ +clean_up_dest: + munmap(dest_addr, c.region_size); +clean_up_src: + munmap(src_addr, c.region_size); +out: + return ret; +} + +static void run_mremap_test_case(struct test test_case, int *failures, + unsigned int threshold_mb, + unsigned int pattern_seed) +{ + long long remap_time = remap_region(test_case.config, threshold_mb, + pattern_seed); + + if (remap_time < 0) { + if (test_case.expect_failure) + ksft_test_result_xfail("%s\n\tExpected mremap failure\n", + test_case.name); + else { + ksft_test_result_fail("%s\n", test_case.name); + *failures += 1; + } + } else { + /* + * Comparing mremap time is only applicable if entire region + * was faulted in. 
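+ * Otherwise the test still passes, but the timing is not reported.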
+ */ + if (threshold_mb == VALIDATION_NO_THRESHOLD || + test_case.config.region_size <= threshold_mb * _1MB) + ksft_test_result_pass("%s\n\tmremap time: %12lldns\n", + test_case.name, remap_time); + else + ksft_test_result_pass("%s\n", test_case.name); + } +} + +static void usage(const char *cmd) +{ + fprintf(stderr, + "Usage: %s [[-t ] [-p ]]\n" + "-t\t only validate threshold_mb of the remapped region\n" + " \t if 0 is supplied no threshold is used; all tests\n" + " \t are run and remapped regions validated fully.\n" + " \t The default threshold used is 4MB.\n" + "-p\t provide a seed to generate the random pattern for\n" + " \t validating the remapped region.\n", cmd); +} + +static int parse_args(int argc, char **argv, unsigned int *threshold_mb, + unsigned int *pattern_seed) +{ + const char *optstr = "t:p:"; + int opt; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 't': + *threshold_mb = atoi(optarg); + break; + case 'p': + *pattern_seed = atoi(optarg); + break; + default: + usage(argv[0]); + return -1; + } + } + + if (optind < argc) { + usage(argv[0]); + return -1; + } + + return 0; +} + +#define MAX_TEST 13 +#define MAX_PERF_TEST 3 +int main(int argc, char **argv) +{ + int failures = 0; + int i, run_perf_tests; + unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; + unsigned int pattern_seed; + int num_expand_tests = 1; + struct test test_cases[MAX_TEST]; + struct test perf_test_cases[MAX_PERF_TEST]; + int page_size; + time_t t; + + pattern_seed = (unsigned int) time(&t); + + if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0) + exit(EXIT_FAILURE); + + ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", + threshold_mb, pattern_seed); + + page_size = sysconf(_SC_PAGESIZE); + + /* Expected mremap failures */ + test_cases[0] = MAKE_TEST(page_size, page_size, page_size, + OVERLAPPING, EXPECT_FAILURE, + "mremap - Source and Destination Regions Overlapping"); + + test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size, + NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Destination Address Misaligned (1KB-aligned)"); + test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size, + NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Source Address Misaligned (1KB-aligned)"); + + /* Src addr PTE aligned */ + test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2, + NON_OVERLAPPING, EXPECT_SUCCESS, + "8KB mremap - Source PTE-aligned, Destination PTE-aligned"); + + /* Src addr 1MB aligned */ + test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"); + test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"); + + /* Src addr PMD aligned */ + test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PTE-aligned"); + test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"); + test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PMD-aligned"); + + /* Src addr PUD aligned */ + test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PTE-aligned"); + test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination 1MB-aligned"); + test_cases[11] = 
MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PMD-aligned"); + test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PUD-aligned"); + + perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PTE-aligned, Destination PTE-aligned"); + /* + * mremap 1GB region - Page table level aligned time + * comparison. + */ + perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PMD-aligned, Destination PMD-aligned"); + perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PUD-aligned, Destination PUD-aligned"); + + run_perf_tests = (threshold_mb == VALIDATION_NO_THRESHOLD) || + (threshold_mb * _1MB >= _1GB); + + ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? + ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) + run_mremap_test_case(test_cases[i], &failures, threshold_mb, + pattern_seed); + + mremap_expand_merge(page_size); + + if (run_perf_tests) { + ksft_print_msg("\n%s\n", + "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); + for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) + run_mremap_test_case(perf_test_cases[i], &failures, + threshold_mb, pattern_seed); + } + + if (failures > 0) + ksft_exit_fail(); + else + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c new file mode 100644 index 000000000000..634d87dfb2a4 --- /dev/null +++ b/tools/testing/selftests/mm/on-fault-limit.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +#ifndef MCL_ONFAULT +#define MCL_ONFAULT (MCL_FUTURE << 1) +#endif + +static int test_limit(void) +{ + int ret = 1; + struct rlimit lims; + void *map; + + if (getrlimit(RLIMIT_MEMLOCK, &lims)) { + perror("getrlimit"); + return ret; + } + + if (mlockall(MCL_ONFAULT | MCL_FUTURE)) { + perror("mlockall"); + return ret; + } + + map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); + if (map != MAP_FAILED) + printf("mmap should have failed, but didn't\n"); + else { + ret = 0; + munmap(map, 2 * lims.rlim_max); + } + + munlockall(); + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + + ret += test_limit(); + return ret; +} diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h new file mode 100644 index 000000000000..92f3be3dd8e5 --- /dev/null +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PKEYS_HELPER_H +#define _PKEYS_HELPER_H +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +/* Define some kernel-like types */ +#define u8 __u8 +#define u16 __u16 +#define u32 __u32 +#define u64 __u64 + +#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 0 +#endif +#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 +extern int dprint_in_signal; +extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +extern int test_nr; +extern int iteration_nr; + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif +static inline void sigsafe_printf(const char *format, ...) 
+{ + va_list ap; + + if (!dprint_in_signal) { + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + } else { + int ret; + /* + * No printf() functions are signal-safe. + * They deadlock easily. Write the format + * string to get some output, even if + * incomplete. + */ + ret = write(1, format, strlen(format)); + if (ret < 0) + exit(1); + } +} +#define dprintf_level(level, args...) do { \ + if (level <= DEBUG_LEVEL) \ + sigsafe_printf(args); \ +} while (0) +#define dprintf0(args...) dprintf_level(0, args) +#define dprintf1(args...) dprintf_level(1, args) +#define dprintf2(args...) dprintf_level(2, args) +#define dprintf3(args...) dprintf_level(3, args) +#define dprintf4(args...) dprintf_level(4, args) + +extern void abort_hooks(void); +#define pkey_assert(condition) do { \ + if (!(condition)) { \ + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("errno at assert: %d", errno); \ + abort_hooks(); \ + exit(__LINE__); \ + } \ +} while (0) + +__attribute__((noinline)) int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); +int sys_pkey_alloc(unsigned long flags, unsigned long init_val); +int sys_pkey_free(unsigned long pkey); +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); +void record_pkey_malloc(void *ptr, long size, int prot); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#include "pkey-x86.h" +#elif defined(__powerpc64__) /* arch */ +#include "pkey-powerpc.h" +#else /* arch */ +#error Architecture not supported +#endif /* arch */ + +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest two, then + * mask off all the other higher bits + */ + return ((reg >> shift) & PKEY_MASK); +} + +extern u64 shadow_pkey_reg; + +static inline u64 _read_pkey_reg(int line) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx" + " shadow: %016llx\n", + line, pkey_reg, shadow_pkey_reg); + assert(pkey_reg == shadow_pkey_reg); + + return pkey_reg; +} + +#define read_pkey_reg() _read_pkey_reg(__LINE__) + +static inline void write_pkey_reg(u64 pkey_reg) +{ + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + /* will do the shadow check for us: */ + read_pkey_reg(); + __write_pkey_reg(pkey_reg); + shadow_pkey_reg = pkey_reg; + dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__, + pkey_reg, __read_pkey_reg()); +} + +/* + * These are technically racy. since something could + * change PKEY register between the read and the write. 
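+ * Any update made in between (for instance by a signal handler) is
+ * clobbered by the final write-back:
+ *
+ *	reg = read_pkey_reg();			// (1) read
+ *	reg &= ~((u64)PKEY_MASK << shift);	// (2) a concurrent update here
+ *	write_pkey_reg(reg);			// (3) ...is silently overwritten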
+ */ +static inline void __pkey_access_allow(int pkey, int do_allow) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2; + + if (do_allow) + pkey_reg &= (1<si_pkey; +#else + return (u32 *)(((u8 *)si) + si_pkey_offset); +#endif +} + +static inline int kernel_has_pkeys(void) +{ + /* try allocating a key and see if it succeeds */ + int ret = sys_pkey_alloc(0, 0); + if (ret <= 0) { + return 0; + } + sys_pkey_free(ret); + return 1; +} + +static inline int is_pkeys_supported(void) +{ + /* check if the cpu supports pkeys */ + if (!cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return 0; + } + + /* check if the kernel supports pkeys */ + if (!kernel_has_pkeys()) { + dprintf1("SKIP: %s: no kernel support\n", __func__); + return 0; + } + + return 1; +} + +#endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h new file mode 100644 index 000000000000..1ebb586b2fbc --- /dev/null +++ b/tools/testing/selftests/mm/pkey-powerpc.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_POWERPC_H +#define _PKEYS_POWERPC_H + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 386 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 384 +# define SYS_pkey_free 385 +#endif +#define REG_IP_IDX PT_NIP +#define REG_TRAPNO PT_TRAP +#define gregs gp_regs +#define fpregs fp_regs +#define si_pkey_offset 0x20 + +#undef PKEY_DISABLE_ACCESS +#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ + +#undef PKEY_DISABLE_WRITE +#define PKEY_DISABLE_WRITE 0x2 + +#define NR_PKEYS 32 +#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey + and 24 other keys that cannot be + represented in the PTE */ +#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, + pkey-1 and exec-only key */ +#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, + pkey-31 and exec-only key */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL << 24) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) + +static inline u32 pkey_bit_position(int pkey) +{ + return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; +} + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg; + + asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 amr = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + asm volatile("isync; mtspr 0xd, %0; isync" + : : "r" ((unsigned long)(amr)) : "memory"); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline bool arch_is_powervm() +{ + struct stat buf; + + if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) ) + return true; + + return false; +} + +static inline int get_arch_reserved_keys(void) +{ + if (sysconf(_SC_PAGESIZE) == 4096) + return NR_RESERVED_PKEYS_4K; + else + if (arch_is_powervm()) + return NR_RESERVED_PKEYS_64K_4KEYS; + else + return NR_RESERVED_PKEYS_64K_3KEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + /* + * powerpc does not allow userspace to change permissions of exec-only + * keys since those keys are not allocated by userspace. 
The signal + * handler wont be able to reset the permissions, which means the code + * will infinitely continue to segfault here. + */ + return; +} + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + + ret = syscall(__NR_subpage_prot, ptr, size, NULL); + if (ret) { + perror("subpage_perm"); + return PTR_ERR_ENOTSUP; + } + + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +#endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h new file mode 100644 index 000000000000..72c14cd3ddc7 --- /dev/null +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_X86_H +#define _PKEYS_X86_H + +#ifdef __i386__ + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 380 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 381 +# define SYS_pkey_free 382 +#endif + +#define REG_IP_IDX REG_EIP +#define si_pkey_offset 0x14 + +#else + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 329 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 330 +# define SYS_pkey_free 331 +#endif + +#define REG_IP_IDX REG_RIP +#define si_pkey_offset 0x20 + +#endif + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#define NR_PKEYS 16 +#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL<<21) +#define PAGE_SIZE 4096 +#define MB (1<<20) + +static inline void __page_o_noops(void) +{ + /* 8-bytes of instruction * 512 bytes = 1 page */ + asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); +} + +static inline u64 __read_pkey_reg(void) +{ + unsigned int eax, edx; + unsigned int ecx = 0; + unsigned pkey_reg; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkey_reg == __read_pkey_reg()); +} + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ +#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ + +static inline int cpu_has_pkeys(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + __cpuid_count(0x7, 0x0, eax, ebx, ecx, edx); + + if (!(ecx & X86_FEATURE_PKU)) { + dprintf2("cpu does not have PKU\n"); + return 0; + } + if (!(ecx & X86_FEATURE_OSPKE)) { + dprintf2("cpu does not have OSPKE\n"); + return 0; + } + return 1; +} + +static inline int cpu_max_xsave_size(void) +{ + unsigned long XSTATE_CPUID = 0xd; + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + 
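+	/*
+	 * Leaf 0xd, subleaf 0: ECX reports the size the XSAVE area would
+	 * need if every feature the CPU supports were enabled in XCR0.
+	 */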
__cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx); + return ecx; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +#define XSTATE_PKEY_BIT (9) +#define XSTATE_PKEY 0x200 +#define XSTATE_BV_OFFSET 512 + +int pkey_reg_xstate_offset(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + int xstate_offset; + int xstate_size; + unsigned long XSTATE_CPUID = 0xd; + int leaf; + + /* assume that XSTATE_PKEY is set in XCR0 */ + leaf = XSTATE_PKEY_BIT; + { + __cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx); + + if (leaf == XSTATE_PKEY_BIT) { + xstate_offset = ebx; + xstate_size = eax; + } + } + + if (xstate_size == 0) { + printf("could not find size/offset of PKEY in xsave state\n"); + return 0; + } + + return xstate_offset; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + int ptr_contents; + + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pkey_fault(pkey); +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c new file mode 100644 index 000000000000..95f403a0c46d --- /dev/null +++ b/tools/testing/selftests/mm/protection_keys.c @@ -0,0 +1,1788 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * + * There are examples in here of: + * * how to set protection keys on memory + * * how to set/clear bits in pkey registers (the rights register) + * * how to handle SEGV_PKUERR signals and extract pkey-relevant + * information from the siginfo + * + * Things to add: + * make sure KSM and KSM COW breaking works + * prefault pages in at malloc, or not + * protect MPX bounds tables with protection keys? 
+ * make sure VMA splitting/merging is working correctly
+ * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
+ * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
+ * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
+ *
+ * Compile like this:
+ *	gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ *	gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ */
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+#include <errno.h>
+#include <linux/elf.h>
+#include <linux/futex.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+#include <setjmp.h>
+
+#include "pkey-helpers.h"
+
+int iteration_nr = 1;
+int test_nr;
+
+u64 shadow_pkey_reg;
+int dprint_in_signal;
+char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+void cat_into_file(char *str, char *file)
+{
+	int fd = open(file, O_RDWR);
+	int ret;
+
+	dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
+	/*
+	 * these need to be raw because they are called under
+	 * pkey_assert()
+	 */
+	if (fd < 0) {
+		fprintf(stderr, "error opening '%s'\n", file);
+		perror("error: ");
+		exit(__LINE__);
+	}
+
+	ret = write(fd, str, strlen(str));
+	if (ret != strlen(str)) {
+		perror("write to file failed");
+		fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
+		exit(__LINE__);
+	}
+	close(fd);
+}
+
+#if CONTROL_TRACING > 0
+static int warned_tracing;
+int tracing_root_ok(void)
+{
+	if (geteuid() != 0) {
+		if (!warned_tracing)
+			fprintf(stderr, "WARNING: not run as root, "
+					"can not do tracing control\n");
+		warned_tracing = 1;
+		return 0;
+	}
+	return 1;
+}
+#endif
+
+void tracing_on(void)
+{
+#if CONTROL_TRACING > 0
+#define TRACEDIR "/sys/kernel/debug/tracing"
+	char pidstr[32];
+
+	if (!tracing_root_ok())
+		return;
+
+	sprintf(pidstr, "%d", getpid());
+	cat_into_file("0", TRACEDIR "/tracing_on");
+	cat_into_file("\n", TRACEDIR "/trace");
+	if (1) {
+		cat_into_file("function_graph", TRACEDIR "/current_tracer");
+		cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
+	} else {
+		cat_into_file("nop", TRACEDIR "/current_tracer");
+	}
+	cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
+	cat_into_file("1", TRACEDIR "/tracing_on");
+	dprintf1("enabled tracing\n");
+#endif
+}
+
+void tracing_off(void)
+{
+#if CONTROL_TRACING > 0
+	if (!tracing_root_ok())
+		return;
+	cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
+#endif
+}
+
+void abort_hooks(void)
+{
+	fprintf(stderr, "running %s()...\n", __func__);
+	tracing_off();
+#ifdef SLEEP_ON_ABORT
+	sleep(SLEEP_ON_ABORT);
+#endif
+}
+
+/*
+ * This attempts to have roughly a page of instructions followed by a few
+ * instructions that do a write, and another page of instructions. That
+ * way, we are pretty sure that the write is in the second page of
+ * instructions and has at least a page of padding behind it.
+ *
+ * *That* lets us be sure to madvise() away the write instruction, which
+ * will then fault, which makes sure that the fault code handles
+ * execute-only memory properly.
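+ *
+ * (Sizing sketch, for illustration only: on x86 the padding instruction
+ * "nopl 0x7eeeeeee(%eax)" encodes to 8 bytes, so 512 * 8 = 4096 bytes,
+ * exactly one 4K page; on powerpc, 16384 plain 4-byte nops fill one
+ * 64K page.)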
+ */ +#ifdef __powerpc64__ +/* This way, both 4K and 64K alignment are maintained */ +__attribute__((__aligned__(65536))) +#else +__attribute__((__aligned__(PAGE_SIZE))) +#endif +void lots_o_noops_around_write(int *write_to_me) +{ + dprintf3("running %s()\n", __func__); + __page_o_noops(); + /* Assume this happens in the second page of instructions: */ + *write_to_me = __LINE__; + /* pad out by another page: */ + __page_o_noops(); + dprintf3("%s() done\n", __func__); +} + +void dump_mem(void *dumpme, int len_bytes) +{ + char *c = (void *)dumpme; + int i; + + for (i = 0; i < len_bytes; i += sizeof(u64)) { + u64 *ptr = (u64 *)(c + i); + dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); + } +} + +static u32 hw_pkey_get(int pkey, unsigned long flags) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); + + return (u32) get_pkey_bits(pkey_reg, pkey); +} + +static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u64 old_pkey_reg = __read_pkey_reg(); + u64 new_pkey_reg; + + /* make sure that 'rights' only contains the bits we expect: */ + assert(!(rights & ~mask)); + + /* modify bits accordingly in old pkey_reg and assign it */ + new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); + + __write_pkey_reg(new_pkey_reg); + + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" + " pkey_reg now: %016llx old_pkey_reg: %016llx\n", + __func__, pkey, rights, flags, 0, __read_pkey_reg(), + old_pkey_reg); + return 0; +} + +void pkey_disable_set(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights; + u64 orig_pkey_reg = read_pkey_reg(); + + dprintf1("START->%s(%d, 0x%x)\n", __func__, + pkey, flags); + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); + assert(!ret); + /* pkey_reg and flags have the same format */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + dprintf1("%s(%d) shadow: 0x%016llx\n", + __func__, pkey, shadow_pkey_reg); + + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", + __func__, pkey, read_pkey_reg()); + if (flags) + pkey_assert(read_pkey_reg() >= orig_pkey_reg); + dprintf1("END<---%s(%d, 0x%x)\n", __func__, + pkey, flags); +} + +void pkey_disable_clear(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights = hw_pkey_get(pkey, syscall_flags); + u64 orig_pkey_reg = read_pkey_reg(); + + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights &= ~flags; + + ret = hw_pkey_set(pkey, pkey_rights, 0); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, + pkey, read_pkey_reg()); + if (flags) + 
assert(read_pkey_reg() <= orig_pkey_reg); +} + +void pkey_write_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); +} +void pkey_write_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); +} +void pkey_access_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); +} +void pkey_access_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); +} + +/* Failed address bound checks: */ +#ifndef SEGV_BNDERR +# define SEGV_BNDERR 3 +#endif + +#ifndef SEGV_PKUERR +# define SEGV_PKUERR 4 +#endif + +static char *si_code_str(int si_code) +{ + if (si_code == SEGV_MAPERR) + return "SEGV_MAPERR"; + if (si_code == SEGV_ACCERR) + return "SEGV_ACCERR"; + if (si_code == SEGV_BNDERR) + return "SEGV_BNDERR"; + if (si_code == SEGV_PKUERR) + return "SEGV_PKUERR"; + return "UNKNOWN"; +} + +int pkey_faults; +int last_si_pkey = -1; +void signal_handler(int signum, siginfo_t *si, void *vucontext) +{ + ucontext_t *uctxt = vucontext; + int trapno; + unsigned long ip; + char *fpregs; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u32 *pkey_reg_ptr; + int pkey_reg_offset; +#endif /* arch */ + u64 siginfo_pkey; + u32 *si_pkey_ptr; + + dprint_in_signal = 1; + dprintf1(">>>>===============SIGSEGV============================\n"); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, + __read_pkey_reg(), shadow_pkey_reg); + + trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; + ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + fpregs = (char *) uctxt->uc_mcontext.fpregs; + + dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", + __func__, trapno, ip, si_code_str(si->si_code), + si->si_code); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#ifdef __i386__ + /* + * 32-bit has some extra padding so that userspace can tell whether + * the XSTATE header is present in addition to the "legacy" FPU + * state. We just assume that it is here. + */ + fpregs += 0x70; +#endif /* i386 */ + pkey_reg_offset = pkey_reg_xstate_offset(); + pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); + + /* + * If we got a PKEY fault, we *HAVE* to have at least one bit set in + * here. 
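+	 *
+	 * (How that offset is found, roughly: pkey_reg_xstate_offset()
+	 * queries CPUID leaf 0xd, sub-leaf 9, the PKRU xfeature:
+	 *
+	 *	__cpuid_count(0xd, XSTATE_PKEY_BIT, eax, ebx, ecx, edx);
+	 *	pkru = *(u32 *)(xsave_buf + ebx);
+	 *
+	 * where ebx is the byte offset of PKRU inside an XSAVE buffer,
+	 * eax the size of that state component, and "xsave_buf" stands
+	 * for any XSAVE area, like "fpregs" here.)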
+ */ + dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); + if (DEBUG_LEVEL > 4) + dump_mem(pkey_reg_ptr - 128, 256); + pkey_assert(*pkey_reg_ptr); +#endif /* arch */ + + dprintf1("siginfo: %p\n", si); + dprintf1(" fpregs: %p\n", fpregs); + + if ((si->si_code == SEGV_MAPERR) || + (si->si_code == SEGV_ACCERR) || + (si->si_code == SEGV_BNDERR)) { + printf("non-PK si_code, exiting...\n"); + exit(4); + } + + si_pkey_ptr = siginfo_get_pkey_ptr(si); + dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); + dump_mem((u8 *)si_pkey_ptr - 8, 24); + siginfo_pkey = *si_pkey_ptr; + pkey_assert(siginfo_pkey < NR_PKEYS); + last_si_pkey = siginfo_pkey; + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + dprintf1("signal pkey_reg from pkey_reg: %016llx\n", + __read_pkey_reg()); + dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ + dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); + *(u64 *)pkey_reg_ptr = 0x00000000; + dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); +#elif defined(__powerpc64__) /* arch */ + /* restore access and let the faulting instruction continue */ + pkey_access_allow(siginfo_pkey); +#endif /* arch */ + pkey_faults++; + dprintf1("<<<<==================================================\n"); + dprint_in_signal = 0; +} + +int wait_all_children(void) +{ + int status; + return waitpid(-1, &status, 0); +} + +void sig_chld(int x) +{ + dprint_in_signal = 1; + dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); + dprint_in_signal = 0; +} + +void setup_sigsegv_handler(void) +{ + int r, rs; + struct sigaction newact; + struct sigaction oldact; + + /* #PF is mapped to sigsegv */ + int signum = SIGSEGV; + + newact.sa_handler = 0; + newact.sa_sigaction = signal_handler; + + /*sigset_t - signals to block while in the handler */ + /* get the old signal mask. 
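+	   (passing a NULL "set" makes sigprocmask() a pure query: the
+	   current mask is copied into the third argument and nothing is
+	   changed, regardless of the SIG_SETMASK value.)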
*/ + rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); + pkey_assert(rs == 0); + + /* call sa_sigaction, not sa_handler*/ + newact.sa_flags = SA_SIGINFO; + + newact.sa_restorer = 0; /* void(*)(), obsolete */ + r = sigaction(signum, &newact, &oldact); + r = sigaction(SIGALRM, &newact, &oldact); + pkey_assert(r == 0); +} + +void setup_handlers(void) +{ + signal(SIGCHLD, &sig_chld); + setup_sigsegv_handler(); +} + +pid_t fork_lazy_child(void) +{ + pid_t forkret; + + forkret = fork(); + pkey_assert(forkret >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); + + if (!forkret) { + /* in the child */ + while (1) { + dprintf1("child sleeping...\n"); + sleep(30); + } + } + return forkret; +} + +int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int sret; + + dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, + ptr, size, orig_prot, pkey); + + errno = 0; + sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); + if (errno) { + dprintf2("SYS_mprotect_key sret: %d\n", sret); + dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); + dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); + if (DEBUG_LEVEL >= 2) + perror("SYS_mprotect_pkey"); + } + return sret; +} + +int sys_pkey_alloc(unsigned long flags, unsigned long init_val) +{ + int ret = syscall(SYS_pkey_alloc, flags, init_val); + dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", + __func__, flags, init_val, ret, errno); + return ret; +} + +int alloc_pkey(void) +{ + int ret; + unsigned long init_val = 0x0; + + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); + ret = sys_pkey_alloc(0, init_val); + /* + * pkey_alloc() sets PKEY register, so we need to reflect it in + * shadow_pkey_reg: + */ + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + if (ret > 0) { + /* clear both the bits: */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + ~PKEY_MASK); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, + __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + /* + * move the new state in from init_val + * (remember, we cheated and init_val == pkey_reg format) + */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + init_val); + } + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); + /* for shadow checking: */ + read_pkey_reg(); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + return ret; +} + +int sys_pkey_free(unsigned long pkey) +{ + int ret = syscall(SYS_pkey_free, pkey); + dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); + return ret; +} + +/* + * I had a bug where pkey bits could be set by mprotect() but + * not cleared. This ensures we get lots of random bit sets + * and clears on the vma and pte pkey bits. 
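+ *
+ * (The approach, in brief: allocate every key the kernel will give us,
+ * pick one of them uniformly at random, then free the rest, so repeated
+ * calls exercise different pkey values and lots of alloc/free churn.)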
+ */ +int alloc_random_pkey(void) +{ + int max_nr_pkey_allocs; + int ret; + int i; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + int random_index; + memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + + /* allocate every possible key and make a note of which ones we got */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + + pkey_assert(nr_alloced > 0); + /* select a random one out of the allocated ones */ + random_index = rand() % nr_alloced; + ret = alloced_pkeys[random_index]; + /* now zero it out so we don't free it next */ + alloced_pkeys[random_index] = 0; + + /* go through the allocated ones that we did not want and free them */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); + return ret; +} + +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int nr_iterations = random() % 100; + int ret; + + while (0) { + int rpkey = alloc_random_pkey(); + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); + dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", + ptr, size, orig_prot, pkey, ret); + if (nr_iterations-- < 0) + break; + + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + sys_pkey_free(rpkey); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + } + pkey_assert(pkey < NR_PKEYS); + + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); + dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", + ptr, size, orig_prot, pkey, ret); + pkey_assert(!ret); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); + return ret; +} + +struct pkey_malloc_record { + void *ptr; + long size; + int prot; +}; +struct pkey_malloc_record *pkey_malloc_records; +struct pkey_malloc_record *pkey_last_malloc_record; +long nr_pkey_malloc_records; +void record_pkey_malloc(void *ptr, long size, int prot) +{ + long i; + struct pkey_malloc_record *rec = NULL; + + for (i = 0; i < nr_pkey_malloc_records; i++) { + rec = &pkey_malloc_records[i]; + /* find a free record */ + if (rec) + break; + } + if (!rec) { + /* every record is full */ + size_t old_nr_records = nr_pkey_malloc_records; + size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); + size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); + dprintf2("new_nr_records: %zd\n", new_nr_records); + dprintf2("new_size: %zd\n", new_size); + pkey_malloc_records = realloc(pkey_malloc_records, new_size); + pkey_assert(pkey_malloc_records != NULL); + rec = &pkey_malloc_records[nr_pkey_malloc_records]; + /* + * realloc() does not initialize memory, so zero it from + * the first new record all the way to the end. 
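+		 *
+		 * (Growth arithmetic: new_nr = old_nr * 2 + 1, i.e.
+		 * 0 -> 1 -> 3 -> 7 -> 15 ..., so each step adds exactly
+		 * old_nr + 1 fresh slots, which is what the loop below
+		 * zeroes.)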
+ */ + for (i = 0; i < new_nr_records - old_nr_records; i++) + memset(rec + i, 0, sizeof(*rec)); + } + dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", + (int)(rec - pkey_malloc_records), rec, ptr, size); + rec->ptr = ptr; + rec->size = size; + rec->prot = prot; + pkey_last_malloc_record = rec; + nr_pkey_malloc_records++; +} + +void free_pkey_malloc(void *ptr) +{ + long i; + int ret; + dprintf3("%s(%p)\n", __func__, ptr); + for (i = 0; i < nr_pkey_malloc_records; i++) { + struct pkey_malloc_record *rec = &pkey_malloc_records[i]; + dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", + ptr, i, rec, rec->ptr, rec->size); + if ((ptr < rec->ptr) || + (ptr >= rec->ptr + rec->size)) + continue; + + dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", + ptr, i, rec, rec->ptr, rec->size); + nr_pkey_malloc_records--; + ret = munmap(rec->ptr, rec->size); + dprintf3("munmap ret: %d\n", ret); + pkey_assert(!ret); + dprintf3("clearing rec->ptr, rec: %p\n", rec); + rec->ptr = NULL; + dprintf3("done clearing rec->ptr, rec: %p\n", rec); + return; + } + pkey_assert(false); +} + + +void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + read_pkey_reg(); + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + read_pkey_reg(); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) +{ + int ret; + void *ptr; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + /* + * Guarantee we can fit at least one huge page in the resulting + * allocation by allocating space for 2: + */ + size = ALIGN_UP(size, HPAGE_SIZE * 2); + ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + record_pkey_malloc(ptr, size, prot); + mprotect_pkey(ptr, size, prot, pkey); + + dprintf1("unaligned ptr: %p\n", ptr); + ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); + dprintf1(" aligned ptr: %p\n", ptr); + ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); + dprintf1("MADV_HUGEPAGE ret: %d\n", ret); + ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); + dprintf1("MADV_WILLNEED ret: %d\n", ret); + memset(ptr, 0, HPAGE_SIZE); + + dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); + return ptr; +} + +int hugetlb_setup_ok; +#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" +#define GET_NR_HUGE_PAGES 10 +void setup_hugetlbfs(void) +{ + int err; + int fd; + char buf[256]; + long hpagesz_kb; + long hpagesz_mb; + + if (geteuid() != 0) { + fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); + return; + } + + cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); + + /* + * Now go make sure that we got the pages and that they + * are PMD-level pages. Someone might have made PUD-level + * pages the default. 
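+	 *
+	 * (Worked example on x86: HPAGE_SIZE is 1UL << 21, so hpagesz_kb
+	 * is 2048 and the file read back is
+	 * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages, which
+	 * should report the 10 pages requested above.)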
+ */ + hpagesz_kb = HPAGE_SIZE / 1024; + hpagesz_mb = hpagesz_kb / 1024; + sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); + fd = open(buf, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); + return; + } + + /* -1 to guarantee leaving the trailing \0 */ + err = read(fd, buf, sizeof(buf)-1); + close(fd); + if (err <= 0) { + fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); + return; + } + + if (atoi(buf) != GET_NR_HUGE_PAGES) { + fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", + hpagesz_mb, buf, GET_NR_HUGE_PAGES); + return; + } + + hugetlb_setup_ok = 1; +} + +void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) +{ + void *ptr; + int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; + + if (!hugetlb_setup_ok) + return PTR_ERR_ENOTSUP; + + dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); + size = ALIGN_UP(size, HPAGE_SIZE * 2); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); + pkey_assert(ptr != (void *)-1); + mprotect_pkey(ptr, size, prot, pkey); + + record_pkey_malloc(ptr, size, prot); + + dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); + return ptr; +} + +void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) +{ + void *ptr; + int fd; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + fd = open("/dax/foo", O_RDWR); + pkey_assert(fd >= 0); + + ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); + pkey_assert(ptr != (void *)-1); + + mprotect_pkey(ptr, size, prot, pkey); + + record_pkey_malloc(ptr, size, prot); + + dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); + close(fd); + return ptr; +} + +void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { + + malloc_pkey_with_mprotect, + malloc_pkey_with_mprotect_subpage, + malloc_pkey_anon_huge, + malloc_pkey_hugetlb +/* can not do direct with the pkey_mprotect() API: + malloc_pkey_mmap_direct, + malloc_pkey_mmap_dax, +*/ +}; + +void *malloc_pkey(long size, int prot, u16 pkey) +{ + void *ret; + static int malloc_type; + int nr_malloc_types = ARRAY_SIZE(pkey_malloc); + + pkey_assert(pkey < NR_PKEYS); + + while (1) { + pkey_assert(malloc_type < nr_malloc_types); + + ret = pkey_malloc[malloc_type](size, prot, pkey); + pkey_assert(ret != (void *)-1); + + malloc_type++; + if (malloc_type >= nr_malloc_types) + malloc_type = (random()%nr_malloc_types); + + /* try again if the malloc_type we tried is unsupported */ + if (ret == PTR_ERR_ENOTSUP) + continue; + + break; + } + + dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, + size, prot, pkey, ret); + return ret; +} + +int last_pkey_faults; +#define UNKNOWN_PKEY -2 +void expected_pkey_fault(int pkey) +{ + dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", + __func__, last_pkey_faults, pkey_faults); + dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); + pkey_assert(last_pkey_faults + 1 == pkey_faults); + + /* + * For exec-only memory, we do not know the pkey in + * advance, so skip this check. + */ + if (pkey != UNKNOWN_PKEY) + pkey_assert(last_si_pkey == pkey); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ + /* + * The signal handler shold have cleared out PKEY register to let the + * test program continue. We now have to restore it. 
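+	 *
+	 * (On x86 the handler zeroed the PKRU slot in the signal frame's
+	 * XSAVE area, so sigreturn resumed us with PKRU == 0; on powerpc
+	 * the handler instead re-allowed access to the faulting key, so
+	 * the register should still match the shadow copy.)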
+ */ + if (__read_pkey_reg() != 0) +#else /* arch */ + if (__read_pkey_reg() != shadow_pkey_reg) +#endif /* arch */ + pkey_assert(0); + + __write_pkey_reg(shadow_pkey_reg); + dprintf1("%s() set pkey_reg=%016llx to restore state after signal " + "nuked it\n", __func__, shadow_pkey_reg); + last_pkey_faults = pkey_faults; + last_si_pkey = -1; +} + +#define do_not_expect_pkey_fault(msg) do { \ + if (last_pkey_faults != pkey_faults) \ + dprintf0("unexpected PKey fault: %s\n", msg); \ + pkey_assert(last_pkey_faults == pkey_faults); \ +} while (0) + +int test_fds[10] = { -1 }; +int nr_test_fds; +void __save_test_fd(int fd) +{ + pkey_assert(fd >= 0); + pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); + test_fds[nr_test_fds] = fd; + nr_test_fds++; +} + +int get_test_read_fd(void) +{ + int test_fd = open("/etc/passwd", O_RDONLY); + __save_test_fd(test_fd); + return test_fd; +} + +void close_test_fds(void) +{ + int i; + + for (i = 0; i < nr_test_fds; i++) { + if (test_fds[i] < 0) + continue; + close(test_fds[i]); + test_fds[i] = -1; + } + nr_test_fds = 0; +} + +#define barrier() __asm__ __volatile__("": : :"memory") +__attribute__((noinline)) int read_ptr(int *ptr) +{ + /* + * Keep GCC from optimizing this away somehow + */ + barrier(); + return *ptr; +} + +void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +{ + int i, err; + int max_nr_pkey_allocs; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + long size; + + pkey_assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. + */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + + /* allocate every possible key and make sure key-0 never got allocated */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + pkey_assert(new_pkey != 0); + + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + /* free all the allocated keys */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + + /* attach key-0 in various modes */ + err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); + pkey_assert(!err); +} + +void test_read_of_write_disabled_region(int *ptr, u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling write access to PKEY[1], doing read\n"); + pkey_write_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + dprintf1("\n"); +} +void test_read_of_access_disabled_region(int *ptr, u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + +void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", + pkey, ptr); + ptr_contents = read_ptr(ptr); + dprintf1("reading ptr before 
disabling the read : %d\n", + ptr_contents); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + +void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling write access; after accessing the page, " + "to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + +void test_write_of_write_disabled_region(int *ptr, u16 pkey) +{ + dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} +void test_write_of_access_disabled_region(int *ptr, u16 pkey) +{ + dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + +void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling access; after accessing the page, " + " to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + +void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) +{ + int ret; + int test_fd = get_test_read_fd(); + + dprintf1("disabling access to PKEY[%02d], " + "having kernel read() to buffer\n", pkey); + pkey_access_deny(pkey); + ret = read(test_fd, ptr, 1); + dprintf1("read ret: %d\n", ret); + pkey_assert(ret); +} +void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) +{ + int ret; + int test_fd = get_test_read_fd(); + + pkey_write_deny(pkey); + ret = read(test_fd, ptr, 100); + dprintf1("read ret: %d\n", ret); + if (ret < 0 && (DEBUG_LEVEL > 0)) + perror("verbose read result (OK for this to be bad)"); + pkey_assert(ret); +} + +void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) +{ + int pipe_ret, vmsplice_ret; + struct iovec iov; + int pipe_fds[2]; + + pipe_ret = pipe(pipe_fds); + + pkey_assert(pipe_ret == 0); + dprintf1("disabling access to PKEY[%02d], " + "having kernel vmsplice from buffer\n", pkey); + pkey_access_deny(pkey); + iov.iov_base = ptr; + iov.iov_len = PAGE_SIZE; + vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); + dprintf1("vmsplice() ret: %d\n", vmsplice_ret); + pkey_assert(vmsplice_ret == -1); + + close(pipe_fds[0]); + close(pipe_fds[1]); +} + +void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) +{ + int ignored = 0xdada; + int futex_ret; + int some_int = __LINE__; + + dprintf1("disabling write to PKEY[%02d], " + "doing futex gunk in buffer\n", pkey); + *ptr = some_int; + pkey_write_deny(pkey); + futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, + &ignored, ignored); + if (DEBUG_LEVEL > 0) + perror("futex"); + dprintf1("futex() ret: %d\n", futex_ret); +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) +{ + int err; + int i; + + /* Note: 0 is the default pkey, so don't mess with it */ + for (i = 1; i < NR_PKEYS; i++) { + if (pkey == i) + continue; + + dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); + err = sys_pkey_free(i); + pkey_assert(err); + + err = sys_pkey_free(i); + pkey_assert(err); + + err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); + pkey_assert(err); + } +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) +{ + int 
err; + int bad_pkey = NR_PKEYS+99; + + /* pass a known-invalid pkey in: */ + err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); + pkey_assert(err); +} + +void become_child(void) +{ + pid_t forkret; + + forkret = fork(); + pkey_assert(forkret >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); + + if (!forkret) { + /* in the child */ + return; + } + exit(0); +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_alloc_exhaust(int *ptr, u16 pkey) +{ + int err; + int allocated_pkeys[NR_PKEYS] = {0}; + int nr_allocated_pkeys = 0; + int i; + + for (i = 0; i < NR_PKEYS*3; i++) { + int new_pkey; + dprintf1("%s() alloc loop: %d\n", __func__, i); + new_pkey = alloc_pkey(); + dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, err, __read_pkey_reg(), + shadow_pkey_reg); + read_pkey_reg(); /* for shadow checking */ + dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); + if ((new_pkey == -1) && (errno == ENOSPC)) { + dprintf2("%s() failed to allocate pkey after %d tries\n", + __func__, nr_allocated_pkeys); + } else { + /* + * Ensure the number of successes never + * exceeds the number of keys supported + * in the hardware. + */ + pkey_assert(nr_allocated_pkeys < NR_PKEYS); + allocated_pkeys[nr_allocated_pkeys++] = new_pkey; + } + + /* + * Make sure that allocation state is properly + * preserved across fork(). + */ + if (i == NR_PKEYS*2) + become_child(); + } + + dprintf3("%s()::%d\n", __func__, __LINE__); + + /* + * On x86: + * There are 16 pkeys supported in hardware. Three are + * allocated by the time we get here: + * 1. The default key (0) + * 2. One possibly consumed by an execute-only mapping. + * 3. One allocated by the test code and passed in via + * 'pkey' to this function. + * Ensure that we can allocate at least another 13 (16-3). + * + * On powerpc: + * There are either 5, 28, 29 or 32 pkeys supported in + * hardware depending on the page size (4K or 64K) and + * platform (powernv or powervm). Four are allocated by + * the time we get here. These include pkey-0, pkey-1, + * exec-only pkey and the one allocated by the test code. + * Ensure that we can allocate the remaining. + */ + pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); + + for (i = 0; i < nr_allocated_pkeys; i++) { + err = sys_pkey_free(allocated_pkeys[i]); + pkey_assert(!err); + read_pkey_reg(); /* for shadow checking */ + } +} + +void arch_force_pkey_reg_init(void) +{ +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u64 *buf; + + /* + * All keys should be allocated and set to allow reads and + * writes, so the register should be all 0. If not, just + * skip the test. + */ + if (read_pkey_reg()) + return; + + /* + * Just allocate an absurd about of memory rather than + * doing the XSAVE size enumeration dance. + */ + buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + + /* These __builtins require compiling with -mxsave */ + + /* XSAVE to build a valid buffer: */ + __builtin_ia32_xsave(buf, XSTATE_PKEY); + /* Clear XSTATE_BV[PKRU]: */ + buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY; + /* XRSTOR will likely get PKRU back to the init state: */ + __builtin_ia32_xrstor(buf, XSTATE_PKEY); + + munmap(buf, 1*MB); +#endif +} + + +/* + * This is mostly useless on ppc for now. But it will not + * hurt anything and should give some better coverage as + * a long-running test that continually checks the pkey + * register. 
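+ *
+ * (The x86 trick above, in short: bit 9 of XSTATE_BV in the XSAVE
+ * header says whether the buffer carries PKRU contents; clearing it
+ * before XRSTOR should make the CPU load PKRU's architectural init
+ * value, which is 0, i.e. all access allowed.)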
+ */ +void test_pkey_init_state(int *ptr, u16 pkey) +{ + int err; + int allocated_pkeys[NR_PKEYS] = {0}; + int nr_allocated_pkeys = 0; + int i; + + for (i = 0; i < NR_PKEYS; i++) { + int new_pkey = alloc_pkey(); + + if (new_pkey < 0) + continue; + allocated_pkeys[nr_allocated_pkeys++] = new_pkey; + } + + dprintf3("%s()::%d\n", __func__, __LINE__); + + arch_force_pkey_reg_init(); + + /* + * Loop for a bit, hoping to get exercise the kernel + * context switch code. + */ + for (i = 0; i < 1000000; i++) + read_pkey_reg(); + + for (i = 0; i < nr_allocated_pkeys; i++) { + err = sys_pkey_free(allocated_pkeys[i]); + pkey_assert(!err); + read_pkey_reg(); /* for shadow checking */ + } +} + +/* + * pkey 0 is special. It is allocated by default, so you do not + * have to call pkey_alloc() to use it first. Make sure that it + * is usable. + */ +void test_mprotect_with_pkey_0(int *ptr, u16 pkey) +{ + long size; + int prot; + + assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. + */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + prot = pkey_last_malloc_record->prot; + + /* Use pkey 0 */ + mprotect_pkey(ptr, size, prot, 0); + + /* Make sure that we can set it back to the original pkey. */ + mprotect_pkey(ptr, size, prot, pkey); +} + +void test_ptrace_of_child(int *ptr, u16 pkey) +{ + __attribute__((__unused__)) int peek_result; + pid_t child_pid; + void *ignored = 0; + long ret; + int status; + /* + * This is the "control" for our little expermient. Make sure + * we can always access it when ptracing. + */ + int *plain_ptr_unaligned = malloc(HPAGE_SIZE); + int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); + + /* + * Fork a child which is an exact copy of this process, of course. + * That means we can do all of our tests via ptrace() and then plain + * memory access and ensure they work differently. 
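+	 *
+	 * (Why they differ: a PTRACE_PEEKDATA read is done by the kernel
+	 * against the child's mm and is not gated by this task's pkey
+	 * rights, while a plain load here is, so only the direct
+	 * read_ptr() calls below should be able to fault.)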
+ */ + child_pid = fork_lazy_child(); + dprintf1("[%d] child pid: %d\n", getpid(), child_pid); + + ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); + if (ret) + perror("attach"); + dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); + pkey_assert(ret != -1); + ret = waitpid(child_pid, &status, WUNTRACED); + if ((ret != child_pid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitpid result %ld stat %x\n", + ret, status); + pkey_assert(0); + } + dprintf2("waitpid ret: %ld\n", ret); + dprintf2("waitpid status: %d\n", status); + + pkey_access_deny(pkey); + pkey_write_deny(pkey); + + /* Write access, untested for now: + ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); + pkey_assert(ret != -1); + dprintf1("poke at %p: %ld\n", peek_at, ret); + */ + + /* + * Try to access the pkey-protected "ptr" via ptrace: + */ + ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); + /* expect it to work, without an error: */ + pkey_assert(ret != -1); + /* Now access from the current task, and expect an exception: */ + peek_result = read_ptr(ptr); + expected_pkey_fault(pkey); + + /* + * Try to access the NON-pkey-protected "plain_ptr" via ptrace: + */ + ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); + /* expect it to work, without an error: */ + pkey_assert(ret != -1); + /* Now access from the current task, and expect NO exception: */ + peek_result = read_ptr(plain_ptr); + do_not_expect_pkey_fault("read plain pointer after ptrace"); + + ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); + pkey_assert(ret != -1); + + ret = kill(child_pid, SIGKILL); + pkey_assert(ret != -1); + + wait(&status); + + free(plain_ptr_unaligned); +} + +void *get_pointer_to_instructions(void) +{ + void *p1; + + p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); + dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); + /* lots_o_noops_around_write should be page-aligned already */ + assert(p1 == &lots_o_noops_around_write); + + /* Point 'p1' at the *second* page of the function: */ + p1 += PAGE_SIZE; + + /* + * Try to ensure we fault this in on next touch to ensure + * we get an instruction fault as opposed to a data one + */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + + return p1; +} + +void test_executing_on_unreadable_memory(int *ptr, u16 pkey) +{ + void *p1; + int scratch; + int ptr_contents; + int ret; + + p1 = get_pointer_to_instructions(); + lots_o_noops_around_write(&scratch); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); + pkey_assert(!ret); + pkey_access_deny(pkey); + + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); + + /* + * Make sure this is an *instruction* fault + */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + lots_o_noops_around_write(&scratch); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, pkey); +} + +void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) +{ + void *p1; + int scratch; + int ptr_contents; + int ret; + + dprintf1("%s() start\n", __func__); + + p1 = get_pointer_to_instructions(); + lots_o_noops_around_write(&scratch); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + + /* Use a *normal* mprotect(), not mprotect_pkey(): */ + ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); + pkey_assert(!ret); + + /* + * Reset the shadow, assuming that the above mprotect() + * correctly changed PKRU, but to an unknown value since + 
* the actual allocated pkey is unknown. + */ + shadow_pkey_reg = __read_pkey_reg(); + + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); + + /* Make sure this is an *instruction* fault */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + lots_o_noops_around_write(&scratch); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); + + /* + * Put the memory back to non-PROT_EXEC. Should clear the + * exec-only pkey off the VMA and allow it to be readable + * again. Go to PROT_NONE first to check for a kernel bug + * that did not clear the pkey when doing PROT_NONE. + */ + ret = mprotect(p1, PAGE_SIZE, PROT_NONE); + pkey_assert(!ret); + + ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); + pkey_assert(!ret); + ptr_contents = read_ptr(p1); + do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); +} + +#if defined(__i386__) || defined(__x86_64__) +void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +{ + u32 new_pkru; + pid_t child; + int status, ret; + int pkey_offset = pkey_reg_xstate_offset(); + size_t xsave_size = cpu_max_xsave_size(); + void *xsave; + u32 *pkey_register; + u64 *xstate_bv; + struct iovec iov; + + new_pkru = ~read_pkey_reg(); + /* Don't make PROT_EXEC mappings inaccessible */ + new_pkru &= ~3; + + child = fork(); + pkey_assert(child >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), child); + if (!child) { + ptrace(PTRACE_TRACEME, 0, 0, 0); + /* Stop and allow the tracer to modify PKRU directly */ + raise(SIGSTOP); + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + if (__read_pkey_reg() != new_pkru) + exit(1); + + /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ + raise(SIGSTOP); + + if (__read_pkey_reg() != 0) + exit(1); + + /* Stop and allow the tracer to examine PKRU */ + raise(SIGSTOP); + + exit(0); + } + + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + xsave = (void *)malloc(xsave_size); + pkey_assert(xsave > 0); + + /* Modify the PKRU register directly */ + iov.iov_base = xsave; + iov.iov_len = xsave_size; + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + pkey_register = (u32 *)(xsave + pkey_offset); + pkey_assert(*pkey_register == read_pkey_reg()); + + *pkey_register = new_pkru; + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == new_pkru); + + /* Execute the tracee */ + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value change */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == new_pkru); + + /* Clear the PKRU bit from XSTATE_BV */ + xstate_bv = (u64 *)(xsave + 512); + *xstate_bv &= ~(1 << 9); + + ret = ptrace(PTRACE_SETREGSET, child, (void 
*)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == 0); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value go to 0 */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == 0); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFEXITED(status)); + pkey_assert(WEXITSTATUS(status) == 0); + free(xsave); +} +#endif + +void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) +{ + int size = PAGE_SIZE; + int sret; + + if (cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return; + } + + sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); + pkey_assert(sret < 0); +} + +void (*pkey_tests[])(int *ptr, u16 pkey) = { + test_read_of_write_disabled_region, + test_read_of_access_disabled_region, + test_read_of_access_disabled_region_with_page_already_mapped, + test_write_of_write_disabled_region, + test_write_of_write_disabled_region_with_page_already_mapped, + test_write_of_access_disabled_region, + test_write_of_access_disabled_region_with_page_already_mapped, + test_kernel_write_of_access_disabled_region, + test_kernel_write_of_write_disabled_region, + test_kernel_gup_of_access_disabled_region, + test_kernel_gup_write_to_write_disabled_region, + test_executing_on_unreadable_memory, + test_implicit_mprotect_exec_only_memory, + test_mprotect_with_pkey_0, + test_ptrace_of_child, + test_pkey_init_state, + test_pkey_syscalls_on_non_allocated_pkey, + test_pkey_syscalls_bad_args, + test_pkey_alloc_exhaust, + test_pkey_alloc_free_attach_pkey0, +#if defined(__i386__) || defined(__x86_64__) + test_ptrace_modifies_pkru, +#endif +}; + +void run_tests_once(void) +{ + int *ptr; + int prot = PROT_READ|PROT_WRITE; + + for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { + int pkey; + int orig_pkey_faults = pkey_faults; + + dprintf1("======================\n"); + dprintf1("test %d preparing...\n", test_nr); + + tracing_on(); + pkey = alloc_random_pkey(); + dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); + ptr = malloc_pkey(PAGE_SIZE, prot, pkey); + dprintf1("test %d starting...\n", test_nr); + pkey_tests[test_nr](ptr, pkey); + dprintf1("freeing test memory: %p\n", ptr); + free_pkey_malloc(ptr); + sys_pkey_free(pkey); + + dprintf1("pkey_faults: %d\n", pkey_faults); + dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); + + tracing_off(); + close_test_fds(); + + printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); + dprintf1("======================\n\n"); + } + iteration_nr++; +} + +void pkey_setup_shadow(void) +{ + shadow_pkey_reg = __read_pkey_reg(); +} + +int main(void) +{ + int nr_iterations = 22; + int pkeys_supported = is_pkeys_supported(); + + srand((unsigned int)time(NULL)); + + setup_handlers(); + + printf("has pkeys: 
%d\n", pkeys_supported);
+
+	if (!pkeys_supported) {
+		int size = PAGE_SIZE;
+		int *ptr;
+
+		printf("running PKEY tests for unsupported CPU/OS\n");
+
+		ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+		assert(ptr != (void *)-1);
+		test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
+		exit(0);
+	}
+
+	pkey_setup_shadow();
+	printf("startup pkey_reg: %016llx\n", read_pkey_reg());
+	setup_hugetlbfs();
+
+	while (nr_iterations-- > 0)
+		run_tests_once();
+
+	printf("done (all tests OK)\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
new file mode 100644
index 000000000000..8984e0bb58c7
--- /dev/null
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -0,0 +1,274 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Please run as root
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+exitcode=0
+
+usage() {
+	cat <<EOF
+usage: $0 [ -h | -t "<categories>"]
+  -t: specify specific categories of tests to run
+  -h: display this message
+
+The default behavior is to run all tests.
+
+Alternatively, specific groups of tests can be run by passing a string
+to the -t argument containing one or more of the following categories
+separated by spaces:
+- mmap
+	tests for mmap(2)
+- gup_test
+	tests for gup using gup_test interface
+- userfaultfd
+	tests for userfaultfd(2)
+- compaction
+	a test for the patch "Allow compaction of unevictable pages"
+- mlock
+	tests for mlock(2)
+- mremap
+	tests for mremap(2)
+- hugevm
+	tests for very large virtual address space
+- vmalloc
+	vmalloc smoke tests
+- hmm
+	hmm smoke tests
+- madv_populate
+	test madvise(2) MADV_POPULATE_{READ,WRITE} options
+- memfd_secret
+	test memfd_secret(2)
+- process_mrelease
+	test process_mrelease(2)
+- ksm
+	ksm tests that do not require >=2 NUMA nodes
+- ksm_numa
+	ksm tests that require >=2 NUMA nodes
+- pkey
+	memory protection key tests
+- soft_dirty
+	test soft dirty page bit semantics
+- cow
+	test copy-on-write semantics
+example: ./run_vmtests.sh -t "hmm mmap ksm"
+EOF
+	exit 0
+}
+
+
+while getopts "ht:" OPT; do
+	case ${OPT} in
+	"h") usage ;;
+	"t") VM_SELFTEST_ITEMS=${OPTARG} ;;
+	esac
+done
+shift $((OPTIND -1))
+
+# default behavior: run all tests
+VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default}
+
+test_selected() {
+	if [ "$VM_SELFTEST_ITEMS" == "default" ]; then
+		# If no VM_SELFTEST_ITEMS are specified, run all tests
+		return 0
+	fi
+	# If test selected argument is one of the test items
+	if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then
+		return 0
+	else
+		return 1
+	fi
+}
+
+# get huge pagesize and freepages from /proc/meminfo
+while read -r name size unit; do
+	if [ "$name" = "HugePages_Free:" ]; then
+		freepgs="$size"
+	fi
+	if [ "$name" = "Hugepagesize:" ]; then
+		hpgsize_KB="$size"
+	fi
+done < /proc/meminfo
+
+# Simple hugetlbfs tests have a hardcoded minimum requirement of
+# huge pages totaling 256MB (262144KB) in size. The userfaultfd
+# hugetlb test requires a minimum of 2 * nr_cpus huge pages. Take
+# both of these requirements into account and attempt to increase
+# number of huge pages available.
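+#
+# Worked example (illustrative): with 8 CPUs and 2048kB huge pages,
+#   half_ufd_size_MB = (((8 * 2) + 127) / 128) * 128 = 128
+#   needmem_KB       = 128 * 2 * 1024 = 262144
+# which also meets the 262144kB (256MB) hugetlbfs minimum exactly.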
+nr_cpus=$(nproc) +hpgsize_MB=$((hpgsize_KB / 1024)) +half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) +needmem_KB=$((half_ufd_size_MB * 2 * 1024)) + +# set proper nr_hugepages +if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then + nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) + needpgs=$((needmem_KB / hpgsize_KB)) + tries=2 + while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do + lackpgs=$((needpgs - freepgs)) + echo 3 > /proc/sys/vm/drop_caches + if ! echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then + echo "Please run this test as root" + exit $ksft_skip + fi + while read -r name size unit; do + if [ "$name" = "HugePages_Free:" ]; then + freepgs=$size + fi + done < /proc/meminfo + tries=$((tries - 1)) + done + if [ "$freepgs" -lt "$needpgs" ]; then + printf "Not enough huge pages available (%d < %d)\n" \ + "$freepgs" "$needpgs" + exit 1 + fi +else + echo "no hugetlbfs support in kernel?" + exit 1 +fi + +# filter 64bit architectures +ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64" +if [ -z "$ARCH" ]; then + ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') +fi +VADDR64=0 +echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1 + +# Usage: run_test [test binary] [arbitrary test arguments...] +run_test() { + if test_selected ${CATEGORY}; then + local title="running $*" + local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) + printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" + + "$@" + local ret=$? + if [ $ret -eq 0 ]; then + echo "[PASS]" + elif [ $ret -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip + else + echo "[FAIL]" + exitcode=1 + fi + fi # test_selected +} + +CATEGORY="hugetlb" run_test ./hugepage-mmap + +shmmax=$(cat /proc/sys/kernel/shmmax) +shmall=$(cat /proc/sys/kernel/shmall) +echo 268435456 > /proc/sys/kernel/shmmax +echo 4194304 > /proc/sys/kernel/shmall +CATEGORY="hugetlb" run_test ./hugepage-shm +echo "$shmmax" > /proc/sys/kernel/shmmax +echo "$shmall" > /proc/sys/kernel/shmall + +CATEGORY="hugetlb" run_test ./map_hugetlb +CATEGORY="hugetlb" run_test ./hugepage-mremap +CATEGORY="hugetlb" run_test ./hugepage-vmemmap +CATEGORY="hugetlb" run_test ./hugetlb-madvise + +if test_selected "hugetlb"; then + echo "NOTE: These hugetlb tests provide minimal coverage. Use" + echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" + echo " hugetlb regression testing." +fi + +CATEGORY="mmap" run_test ./map_fixed_noreplace + +# get_user_pages_fast() benchmark +CATEGORY="gup_test" run_test ./gup_test -u +# pin_user_pages_fast() benchmark +CATEGORY="gup_test" run_test ./gup_test -a +# Dump pages 0, 19, and 4096, using pin_user_pages: +CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 + +uffd_mods=("" ":dev") +for mod in "${uffd_mods[@]}"; do + CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16 + # Hugetlb tests require source and destination huge pages. Pass in half + # the size ($half_ufd_size_MB), which is used for *each*. 
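+	# (e.g. with half_ufd_size_MB=128, each hugetlb run gets two
+	# 128MB halves, matching the needmem_KB sizing computed above.)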
+	CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
+	CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32
+	CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16
+done
+
+#cleanup
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
+
+CATEGORY="compaction" run_test ./compaction_test
+
+CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit
+
+CATEGORY="mmap" run_test ./map_populate
+
+CATEGORY="mlock" run_test ./mlock-random-test
+
+CATEGORY="mlock" run_test ./mlock2-tests
+
+CATEGORY="process_mrelease" run_test ./mrelease_test
+
+CATEGORY="mremap" run_test ./mremap_test
+
+CATEGORY="hugetlb" run_test ./thuge-gen
+
+if [ $VADDR64 -ne 0 ]; then
+	CATEGORY="hugevm" run_test ./virtual_address_range
+
+	# virtual address 128TB switch test
+	CATEGORY="hugevm" run_test ./va_128TBswitch.sh
+fi # VADDR64
+
+# vmalloc stability smoke test
+CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke
+
+CATEGORY="mremap" run_test ./mremap_dontunmap
+
+CATEGORY="hmm" run_test ./test_hmm.sh smoke
+
+# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
+CATEGORY="madv_populate" run_test ./madv_populate
+
+CATEGORY="memfd_secret" run_test ./memfd_secret
+
+# KSM MADV_MERGEABLE test with 10 identical pages
+CATEGORY="ksm" run_test ./ksm_tests -M -p 10
+# KSM unmerge test
+CATEGORY="ksm" run_test ./ksm_tests -U
+# KSM test with 10 zero pages and use_zero_pages = 0
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0
+# KSM test with 10 zero pages and use_zero_pages = 1
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1
+# KSM test with 2 NUMA nodes and merge_across_nodes = 1
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1
+# KSM test with 2 NUMA nodes and merge_across_nodes = 0
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0
+
+CATEGORY="ksm" run_test ./ksm_functional_tests
+
+# protection_keys tests
+if [ -x ./protection_keys_32 ]
+then
+	CATEGORY="pkey" run_test ./protection_keys_32
+fi
+
+if [ -x ./protection_keys_64 ]
+then
+	CATEGORY="pkey" run_test ./protection_keys_64
+fi
+
+CATEGORY="soft_dirty" run_test ./soft-dirty
+
+# COW tests
+CATEGORY="cow" run_test ./cow
+
+exit $exitcode
diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings
new file mode 100644
index 000000000000..9abfc60e9e6f
--- /dev/null
+++ b/tools/testing/selftests/mm/settings
@@ -0,0 +1 @@
+timeout=45
diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c
new file mode 100644
index 000000000000..21d8830c5f24
--- /dev/null
+++ b/tools/testing/selftests/mm/soft-dirty.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <malloc.h>
+#include <sys/mman.h>
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define PAGEMAP_FILE_PATH "/proc/self/pagemap"
+#define TEST_ITERATIONS 10000
+
+static void test_simple(int pagemap_fd, int pagesize)
+{
+	int i;
+	char *map;
+
+	map = aligned_alloc(pagesize, pagesize);
+	if (!map)
+		ksft_exit_fail_msg("aligned_alloc failed\n");
+
+	clear_softdirty();
+
+	for (i = 0 ; i < TEST_ITERATIONS; i++) {
+		if (pagemap_is_softdirty(pagemap_fd, map) == 1) {
+			ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i);
+			break;
+		}
+
+		clear_softdirty();
+		// Write something to the page to get the dirty bit enabled on the page
+		map[0]++;
+
+		if (pagemap_is_softdirty(pagemap_fd, map) == 0) {
+			ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i);
+			break;
+		}
+
+		clear_softdirty();
+	}
free(map); + + ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__); +} + +static void test_vma_reuse(int pagemap_fd, int pagesize) +{ + char *map, *map2; + + map = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed"); + + // The kernel always marks new regions as soft dirty + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s dirty bit of allocated page\n", __func__); + + clear_softdirty(); + munmap(map, pagesize); + + map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed"); + + // Dirty bit is set for new regions even if they are reused + if (map == map2) + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s dirty bit of reused address page\n", __func__); + else + ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__); + + munmap(map2, pagesize); +} + +static void test_hugepage(int pagemap_fd, int pagesize) +{ + char *map; + int i, ret; + size_t hpage_len = read_pmd_pagesize(); + + map = memalign(hpage_len, hpage_len); + if (!map) + ksft_exit_fail_msg("memalign failed\n"); + + ret = madvise(map, hpage_len, MADV_HUGEPAGE); + if (ret) + ksft_exit_fail_msg("madvise failed %d\n", ret); + + for (i = 0; i < hpage_len; i++) + map[i] = (char)i; + + if (check_huge_anon(map, 1, hpage_len)) { + ksft_test_result_pass("Test %s huge page allocation\n", __func__); + + clear_softdirty(); + for (i = 0 ; i < TEST_ITERATIONS ; i++) { + if (pagemap_is_softdirty(pagemap_fd, map) == 1) { + ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + clear_softdirty(); + // Write something to the page to get the dirty bit enabled on the page + map[0]++; + + if (pagemap_is_softdirty(pagemap_fd, map) == 0) { + ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); + break; + } + clear_softdirty(); + } + + ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__); + } else { + // hugepage allocation failed. 
skip these tests
+		ksft_test_result_skip("Test %s huge page allocation\n", __func__);
+		ksft_test_result_skip("Test %s huge page dirty bit\n", __func__);
+	}
+	free(map);
+}
+
+static void test_mprotect(int pagemap_fd, int pagesize, bool anon)
+{
+	const char *type[] = {"file", "anon"};
+	const char *fname = "./soft-dirty-test-file";
+	int test_fd;
+	char *map;
+
+	if (anon) {
+		map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
+			   MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+		if (map == MAP_FAILED)
+			ksft_exit_fail_msg("anon mmap failed\n");
+	} else {
+		test_fd = open(fname, O_RDWR | O_CREAT, 0644);
+		if (test_fd < 0) {
+			ksft_test_result_skip("Test %s open() file failed\n", __func__);
+			return;
+		}
+		unlink(fname);
+		ftruncate(test_fd, pagesize);
+		map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
+			   MAP_SHARED, test_fd, 0);
+		if (map == MAP_FAILED)
+			ksft_exit_fail_msg("file mmap failed\n");
+	}
+
+	*map = 1;
+	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+			 "Test %s-%s dirty bit of new written page\n",
+			 __func__, type[anon]);
+	clear_softdirty();
+	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+			 "Test %s-%s soft-dirty clear after clear_refs\n",
+			 __func__, type[anon]);
+	mprotect(map, pagesize, PROT_READ);
+	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+			 "Test %s-%s soft-dirty clear after marking RO\n",
+			 __func__, type[anon]);
+	mprotect(map, pagesize, PROT_READ|PROT_WRITE);
+	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+			 "Test %s-%s soft-dirty clear after marking RW\n",
+			 __func__, type[anon]);
+	*map = 2;
+	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+			 "Test %s-%s soft-dirty after rewritten\n",
+			 __func__, type[anon]);
+
+	munmap(map, pagesize);
+
+	if (!anon)
+		close(test_fd);
+}
+
+static void test_mprotect_anon(int pagemap_fd, int pagesize)
+{
+	test_mprotect(pagemap_fd, pagesize, true);
+}
+
+static void test_mprotect_file(int pagemap_fd, int pagesize)
+{
+	test_mprotect(pagemap_fd, pagesize, false);
+}
+
+int main(int argc, char **argv)
+{
+	int pagemap_fd;
+	int pagesize;
+
+	ksft_print_header();
+	ksft_set_plan(15);
+
+	pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY);
+	if (pagemap_fd < 0)
+		ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH);
+
+	pagesize = getpagesize();
+
+	test_simple(pagemap_fd, pagesize);
+	test_vma_reuse(pagemap_fd, pagesize);
+	test_hugepage(pagemap_fd, pagesize);
+	test_mprotect_anon(pagemap_fd, pagesize);
+	test_mprotect_file(pagemap_fd, pagesize);
+
+	close(pagemap_fd);
+
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
new file mode 100644
index 000000000000..76e1c36dd9e5
--- /dev/null
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
+ * address range in a process via /split_huge_pages interface.
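+ * The debugfs file (SPLIT_DEBUGFS below) accepts two input formats,
+ * matching PID_FMT and PATH_FMT: "<pid>,<vaddr_start>,<vaddr_end>" splits
+ * THPs mapped in that process, and "<path>,<pgoff_start>,<pgoff_end>"
+ * splits THPs of a file. For example (hypothetical values):
+ *
+ *	echo "1234,0x7f0000000000,0x7f0000200000" > \
+ *		/sys/kernel/debug/split_huge_pages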
+ */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vm_util.h" + +uint64_t pagesize; +unsigned int pageshift; +uint64_t pmd_pagesize; + +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" +#define INPUT_MAX 80 + +#define PID_FMT "%d,0x%lx,0x%lx" +#define PATH_FMT "%s,0x%lx,0x%lx" + +#define PFN_MASK ((1UL<<55)-1) +#define KPF_THP (1UL<<22) + +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) +{ + uint64_t paddr; + uint64_t page_flags; + + if (pagemap_file) { + pread(pagemap_file, &paddr, sizeof(paddr), + ((long)vaddr >> pageshift) * sizeof(paddr)); + + if (kpageflags_file) { + pread(kpageflags_file, &page_flags, sizeof(page_flags), + (paddr & PFN_MASK) * sizeof(page_flags)); + + return !!(page_flags & KPF_THP); + } + } + return 0; +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) + return 0; + + return (unsigned int) numwritten; +} + +static void write_debugfs(const char *fmt, ...) +{ + char input[INPUT_MAX]; + int ret; + va_list argp; + + va_start(argp, fmt); + ret = vsnprintf(input, INPUT_MAX, fmt, argp); + va_end(argp); + + if (ret >= INPUT_MAX) { + printf("%s: Debugfs input is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { + perror(SPLIT_DEBUGFS); + exit(EXIT_FAILURE); + } +} + +void split_pmd_thp(void) +{ + char *one_page; + size_t len = 4 * pmd_pagesize; + size_t i; + + one_page = memalign(pmd_pagesize, len); + + if (!one_page) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* split all THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + + if (check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); + exit(EXIT_FAILURE); + } + + printf("Split huge pages successful\n"); + free(one_page); +} + +void split_pte_mapped_thp(void) +{ + char *one_page, *pte_mapped, *pte_mapped2; + size_t len = 4 * pmd_pagesize; + uint64_t thp_size; + size_t i; + const char *pagemap_template = "/proc/%d/pagemap"; + const char *kpageflags_proc = "/proc/kpageflags"; + char pagemap_proc[255]; + int pagemap_fd; + int kpageflags_fd; + + if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { + perror("get pagemap proc error"); + exit(EXIT_FAILURE); + } + pagemap_fd = open(pagemap_proc, O_RDONLY); + + if (pagemap_fd == -1) { + perror("read pagemap:"); + exit(EXIT_FAILURE); + } + + kpageflags_fd = open(kpageflags_proc, O_RDONLY); + + if (kpageflags_fd == -1) { + perror("read kpageflags:"); + exit(EXIT_FAILURE); + } + + one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* remap the first pagesize of first THP */ + pte_mapped = 
mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); + + /* remap the Nth pagesize of Nth THP */ + for (i = 1; i < 4; i++) { + pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i, + pagesize, pagesize, + MREMAP_MAYMOVE|MREMAP_FIXED, + pte_mapped + pagesize * i); + if (pte_mapped2 == (char *)-1) { + perror("mremap failed"); + exit(EXIT_FAILURE); + } + } + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + + if (thp_size != 4) { + printf("Some THPs are missing during mremap\n"); + exit(EXIT_FAILURE); + } + + /* split all remapped THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, + (uint64_t)pte_mapped + pagesize * 4); + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) { + if (pte_mapped[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + } + + if (thp_size) { + printf("Still %ld THPs not split\n", thp_size); + exit(EXIT_FAILURE); + } + + printf("Split PTE-mapped huge pages successful\n"); + munmap(one_page, len); + close(pagemap_fd); + close(kpageflags_fd); +} + +void split_file_backed_thp(void) +{ + int status; + int fd; + ssize_t num_written; + char tmpfs_template[] = "/tmp/thp_split_XXXXXX"; + const char *tmpfs_loc = mkdtemp(tmpfs_template); + char testfile[INPUT_MAX]; + uint64_t pgoff_start = 0, pgoff_end = 1024; + + printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); + + status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); + + if (status) { + printf("Unable to create a tmpfs for testing\n"); + exit(EXIT_FAILURE); + } + + status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); + if (status >= INPUT_MAX) { + printf("Fail to create file-backed THP split testing file\n"); + goto cleanup; + } + + fd = open(testfile, O_CREAT|O_WRONLY); + if (fd == -1) { + perror("Cannot open testing file\n"); + goto cleanup; + } + + /* write something to the file, so a file-backed THP can be allocated */ + num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1); + close(fd); + + if (num_written < 1) { + printf("Fail to write data to testing file\n"); + goto cleanup; + } + + /* split the file-backed THP */ + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); + + status = unlink(testfile); + if (status) + perror("Cannot remove testing file\n"); + +cleanup: + status = umount(tmpfs_loc); + if (status) { + printf("Unable to umount %s\n", tmpfs_loc); + exit(EXIT_FAILURE); + } + status = rmdir(tmpfs_loc); + if (status) { + perror("cannot remove tmp dir"); + exit(EXIT_FAILURE); + } + + printf("file-backed THP split test done, please check dmesg for more information\n"); +} + +int main(int argc, char **argv) +{ + if (geteuid() != 0) { + printf("Please run the benchmark as root\n"); + exit(EXIT_FAILURE); + } + + pagesize = getpagesize(); + pageshift = ffs(pagesize) - 1; + pmd_pagesize = read_pmd_pagesize(); + + split_pmd_thp(); + split_pte_mapped_thp(); + split_file_backed_thp(); + + return 0; +} diff --git a/tools/testing/selftests/mm/test_hmm.sh b/tools/testing/selftests/mm/test_hmm.sh new file mode 100644 index 000000000000..46e19b5d648d --- /dev/null +++ b/tools/testing/selftests/mm/test_hmm.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# 
SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony)
+#
+# This is a test script for the test_hmm kernel driver (HMM, Heterogeneous
+# Memory Management). Therefore it is just a kernel module loader: it
+# loads the test_hmm module, runs the hmm-tests binary against it, and
+# optionally passes start addresses of special-purpose memory (SPM) for
+# the two test devices via module parameters.
+
+TEST_NAME="test_hmm"
+DRIVER="test_hmm"
+
+# exit 1 on failure
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+check_test_requirements()
+{
+	uid=$(id -u)
+	if [ $uid -ne 0 ]; then
+		echo "$0: Must be run as root"
+		exit $ksft_skip
+	fi
+
+	if ! which modprobe > /dev/null 2>&1; then
+		echo "$0: You need modprobe installed"
+		exit $ksft_skip
+	fi
+
+	if ! modinfo $DRIVER > /dev/null 2>&1; then
+		echo "$0: You must have the following enabled in your kernel:"
+		echo "CONFIG_TEST_HMM=m"
+		exit $ksft_skip
+	fi
+}
+
+load_driver()
+{
+	if [ $# -eq 0 ]; then
+		modprobe $DRIVER > /dev/null 2>&1
+	else
+		if [ $# -eq 2 ]; then
+			modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 \
+				> /dev/null 2>&1
+		else
+			echo "Missing module parameters. Make sure to pass"\
+				"spm_addr_dev0 and spm_addr_dev1"
+			usage
+		fi
+	fi
+}
+
+unload_driver()
+{
+	modprobe -r $DRIVER > /dev/null 2>&1
+}
+
+run_smoke()
+{
+	echo "Running smoke test. Note, this test provides basic coverage."
+
+	load_driver $1 $2
+	$(dirname "${BASH_SOURCE[0]}")/hmm-tests
+	unload_driver
+}
+
+usage()
+{
+	echo -n "Usage: $0"
+	echo
+	echo "Example usage:"
+	echo
+	echo "# Shows help message"
+	echo "./${TEST_NAME}.sh"
+	echo
+	echo "# Smoke testing"
+	echo "./${TEST_NAME}.sh smoke"
+	echo
+	echo "# Smoke testing with SPM enabled"
+	echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>"
+	echo
+	exit 0
+}
+
+function run_test()
+{
+	if [ $# -eq 0 ]; then
+		usage
+	else
+		if [ "$1" = "smoke" ]; then
+			run_smoke $2 $3
+		else
+			usage
+		fi
+	fi
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh
new file mode 100644
index 000000000000..d73b846736f1
--- /dev/null
+++ b/tools/testing/selftests/mm/test_vmalloc.sh
@@ -0,0 +1,177 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony)
+#
+# This is a test script for the kernel test driver to analyse vmalloc
+# allocator. Therefore it is just a kernel module loader. You can specify
+# and pass different parameters in order to:
+#     a) analyse performance of vmalloc allocations;
+#     b) stress and stability check of the vmalloc subsystem.
+
+TEST_NAME="vmalloc"
+DRIVER="test_${TEST_NAME}"
+NUM_CPUS=`grep -c ^processor /proc/cpuinfo`
+
+# exit 1 on failure
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+#
+# Static templates for performance, stressing and smoke tests.
+# Also it is possible to pass any supported parameters manually.
+#
+PERF_PARAM="sequential_test_order=1 test_repeat_count=3"
+SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10"
+STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20"
+
+check_test_requirements()
+{
+	uid=$(id -u)
+	if [ $uid -ne 0 ]; then
+		echo "$0: Must be run as root"
+		exit $ksft_skip
+	fi
+
+	if ! which modprobe > /dev/null 2>&1; then
+		echo "$0: You need modprobe installed"
+		exit $ksft_skip
+	fi
+
+	if !
modinfo $DRIVER > /dev/null 2>&1; then
+		echo "$0: You must have the following enabled in your kernel:"
+		echo "CONFIG_TEST_VMALLOC=m"
+		exit $ksft_skip
+	fi
+}
+
+run_perfformance_check()
+{
+	echo "Run performance tests to evaluate how fast vmalloc allocation is."
+	echo "It runs all test cases on one single CPU with sequential order."
+
+	modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel message buffer to see the summary."
+}
+
+run_stability_check()
+{
+	echo "Run stability tests. In order to stress vmalloc subsystem all"
+	echo "available test cases are run by NUM_CPUS workers simultaneously."
+	echo "It will take time, so be patient."
+
+	modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
+}
+
+run_smoke_check()
+{
+	echo "Run smoke test. Note, this test provides basic coverage."
+	echo "Please check $0 output how it can be used"
+	echo "for deep performance analysis as well as stress testing."
+
+	modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
+}
+
+usage()
+{
+	echo -n "Usage: $0 [ performance ] | [ stress ] | [ smoke ] | "
+	echo "manual parameters"
+	echo
+	echo "Valid tests and parameters:"
+	echo
+	modinfo $DRIVER
+	echo
+	echo "Example usage:"
+	echo
+	echo "# Shows help message"
+	echo "./${DRIVER}.sh"
+	echo
+	echo "# Runs 1 test (id_1), repeats it 5 times by NUM_CPUS workers"
+	echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5"
+	echo
+	echo -n "# Runs 4 tests (id_1|id_2|id_4|id_16) on one CPU with "
+	echo "sequential order"
+	echo -n "./${DRIVER}.sh sequential_test_order=1 "
+	echo "run_test_mask=23"
+	echo
+	echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats "
+	echo "20 times"
+	echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20"
+	echo
+	echo "# Performance analysis"
+	echo "./${DRIVER}.sh performance"
+	echo
+	echo "# Stress testing"
+	echo "./${DRIVER}.sh stress"
+	echo
+	exit 0
+}
+
+function validate_passed_args()
+{
+	VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
+
+	#
+	# Something has been passed, check it.
+	#
+	for passed_arg in $@; do
+		key=${passed_arg//=*/}
+		val="${passed_arg:$((${#key}+1))}"
+		valid=0
+
+		for valid_arg in $VALID_ARGS; do
+			if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then
+				valid=1
+				break
+			fi
+		done
+
+		if [[ $valid -ne 1 ]]; then
+			echo "Error: key or value is not correct: ${key} $val"
+			exit $exitcode
+		fi
+	done
+}
+
+function run_manual_check()
+{
+	#
+	# Validate passed parameters. If there is a wrong one,
+	# the script exits and does not execute further.
+	#
+	validate_passed_args $@
+
+	echo "Run the test with the following parameters: $@"
+	modprobe $DRIVER $@ > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
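+	#
+	# As a worked example (hypothetical invocation): for
+	#	./test_vmalloc.sh run_test_mask=7 test_repeat_count=2
+	# validate_passed_args() splits each argument into key and value
+	# (run_test_mask -> 7, test_repeat_count -> 2) and accepts it only
+	# if the key is a parameter listed by "modinfo test_vmalloc" and
+	# the value is a positive number.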
+} + +function run_test() +{ + if [ $# -eq 0 ]; then + usage + else + if [[ "$1" = "performance" ]]; then + run_perfformance_check + elif [[ "$1" = "stress" ]]; then + run_stability_check + elif [[ "$1" = "smoke" ]]; then + run_smoke_check + else + run_manual_check $@ + fi + fi +} + +check_test_requirements +run_test $@ + +exit 0 diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c new file mode 100644 index 000000000000..361ef7192cc6 --- /dev/null +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Test selecting other page sizes for mmap/shmget. + + Before running this huge pages for each huge page size must have been + reserved. + For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used. + Also shmmax must be increased. + And you need to run as root to work around some weird permissions in shm. + And nothing using huge pages should run in parallel. + When the program aborts you may need to clean up the shm segments with + ipcrm -m by hand, like this + sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m + (warning this will remove all if someone else uses them) */ + +#define _GNU_SOURCE 1 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define err(x) perror(x), exit(1) + +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) +#define MAP_HUGE_SHIFT 26 +#define MAP_HUGE_MASK 0x3f +#if !defined(MAP_HUGETLB) +#define MAP_HUGETLB 0x40000 +#endif + +#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ +#define SHM_HUGE_SHIFT 26 +#define SHM_HUGE_MASK 0x3f +#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) +#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) + +#define NUM_PAGESIZES 5 + +#define NUM_PAGES 4 + +#define Dprintf(fmt...) // printf(fmt) + +unsigned long page_sizes[NUM_PAGESIZES]; +int num_page_sizes; + +int ilog2(unsigned long v) +{ + int l = 0; + while ((1UL << l) < v) + l++; + return l; +} + +void find_pagesizes(void) +{ + glob_t g; + int i; + glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); + assert(g.gl_pathc <= NUM_PAGESIZES); + for (i = 0; i < g.gl_pathc; i++) { + sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", + &page_sizes[i]); + page_sizes[i] <<= 10; + printf("Found %luMB\n", page_sizes[i] >> 20); + } + num_page_sizes = g.gl_pathc; + globfree(&g); +} + +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + free(line); + return hps; +} + +void show(unsigned long ps) +{ + char buf[100]; + if (ps == getpagesize()) + return; + printf("%luMB: ", ps >> 20); + fflush(stdout); + snprintf(buf, sizeof buf, + "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", + ps >> 10); + system(buf); +} + +unsigned long read_sysfs(int warn, char *fmt, ...) 
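+/*
+ * read_sysfs() builds the file path printf-style and returns the first
+ * number found in that file (0 if the file is missing; "warn" only
+ * controls whether a missing file is reported). read_free() below is the
+ * typical caller, e.g. for 2 MiB pages it ends up reading
+ * /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages.
+ */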
+{ + char *line = NULL; + size_t linelen = 0; + char buf[100]; + FILE *f; + va_list ap; + unsigned long val = 0; + + va_start(ap, fmt); + vsnprintf(buf, sizeof buf, fmt, ap); + va_end(ap); + + f = fopen(buf, "r"); + if (!f) { + if (warn) + printf("missing %s\n", buf); + return 0; + } + if (getline(&line, &linelen, f) > 0) { + sscanf(line, "%lu", &val); + } + fclose(f); + free(line); + return val; +} + +unsigned long read_free(unsigned long ps) +{ + return read_sysfs(ps != getpagesize(), + "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", + ps >> 10); +} + +void test_mmap(unsigned long size, unsigned flags) +{ + char *map; + unsigned long before, after; + int err; + + before = read_free(size); + map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0); + + if (map == (char *)-1) err("mmap"); + memset(map, 0xff, size*NUM_PAGES); + after = read_free(size); + Dprintf("before %lu after %lu diff %ld size %lu\n", + before, after, before - after, size); + assert(size == getpagesize() || (before - after) == NUM_PAGES); + show(size); + err = munmap(map, size); + assert(!err); +} + +void test_shmget(unsigned long size, unsigned flags) +{ + int id; + unsigned long before, after; + int err; + + before = read_free(size); + id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags); + if (id < 0) err("shmget"); + + struct shm_info i; + if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl"); + Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss); + + + Dprintf("id %d\n", id); + char *map = shmat(id, NULL, 0600); + if (map == (char*)-1) err("shmat"); + + shmctl(id, IPC_RMID, NULL); + + memset(map, 0xff, size*NUM_PAGES); + after = read_free(size); + + Dprintf("before %lu after %lu diff %ld size %lu\n", + before, after, before - after, size); + assert(size == getpagesize() || (before - after) == NUM_PAGES); + show(size); + err = shmdt(map); + assert(!err); +} + +void sanity_checks(void) +{ + int i; + unsigned long largest = getpagesize(); + + for (i = 0; i < num_page_sizes; i++) { + if (page_sizes[i] > largest) + largest = page_sizes[i]; + + if (read_free(page_sizes[i]) < NUM_PAGES) { + printf("Not enough huge pages for page size %lu MB, need %u\n", + page_sizes[i] >> 20, + NUM_PAGES); + exit(0); + } + } + + if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) { + printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES); + exit(0); + } + +#if defined(__x86_64__) + if (largest != 1U<<30) { + printf("No GB pages available on x86-64\n" + "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); + exit(0); + } +#endif +} + +int main(void) +{ + int i; + unsigned default_hps = default_huge_page_size(); + + find_pagesizes(); + + sanity_checks(); + + for (i = 0; i < num_page_sizes; i++) { + unsigned long ps = page_sizes[i]; + int arg = ilog2(ps) << MAP_HUGE_SHIFT; + printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg); + test_mmap(ps, MAP_HUGETLB | arg); + } + printf("Testing default huge mmap\n"); + test_mmap(default_hps, SHM_HUGETLB); + + puts("Testing non-huge shmget"); + test_shmget(getpagesize(), 0); + + for (i = 0; i < num_page_sizes; i++) { + unsigned long ps = page_sizes[i]; + int arg = ilog2(ps) << SHM_HUGE_SHIFT; + printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg); + test_shmget(ps, SHM_HUGETLB | arg); + } + puts("default huge shmget"); + test_shmget(default_hps, SHM_HUGETLB); + + return 0; +} diff --git a/tools/testing/selftests/mm/transhuge-stress.c 
b/tools/testing/selftests/mm/transhuge-stress.c new file mode 100644 index 000000000000..e3f00adb1b82 --- /dev/null +++ b/tools/testing/selftests/mm/transhuge-stress.c @@ -0,0 +1,122 @@ +/* + * Stress test for transparent huge pages, memory compaction and migration. + * + * Authors: Konstantin Khlebnikov + * + * This is free and unencumbered software released into the public domain. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +int backing_fd = -1; +int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; +#define PROT_RW (PROT_READ | PROT_WRITE) + +int main(int argc, char **argv) +{ + size_t ram, len; + void *ptr, *p; + struct timespec a, b; + int i = 0; + char *name = NULL; + double s; + uint8_t *map; + size_t map_len; + int pagemap_fd; + + ram = sysconf(_SC_PHYS_PAGES); + if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4) + ram = SIZE_MAX / 4; + else + ram *= sysconf(_SC_PAGESIZE); + len = ram; + + while (++i < argc) { + if (!strcmp(argv[i], "-h")) + errx(1, "usage: %s [size in MiB]", argv[0]); + else if (!strcmp(argv[i], "-f")) + name = argv[++i]; + else + len = atoll(argv[i]) << 20; + } + + if (name) { + backing_fd = open(name, O_RDWR); + if (backing_fd == -1) + errx(2, "open %s", name); + mmap_flags = MAP_SHARED; + } + + warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" + " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, + ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + err(2, "open pagemap"); + + len -= len % HPAGE_SIZE; + ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0); + if (ptr == MAP_FAILED) + err(2, "initial mmap"); + ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE; + + if (madvise(ptr, len, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + map_len = ram >> (HPAGE_SHIFT - 1); + map = malloc(map_len); + if (!map) + errx(2, "map malloc"); + + while (1) { + int nr_succeed = 0, nr_failed = 0, nr_pages = 0; + + memset(map, 0, map_len); + + clock_gettime(CLOCK_MONOTONIC, &a); + for (p = ptr; p < ptr + len; p += HPAGE_SIZE) { + int64_t pfn; + + pfn = allocate_transhuge(p, pagemap_fd); + + if (pfn < 0) { + nr_failed++; + } else { + size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT); + + nr_succeed++; + if (idx >= map_len) { + map = realloc(map, idx + 1); + if (!map) + errx(2, "map realloc"); + memset(map + map_len, 0, idx + 1 - map_len); + map_len = idx + 1; + } + if (!map[idx]) + nr_pages++; + map[idx] = 1; + } + + /* split transhuge page, keep last page */ + if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED)) + err(2, "MADV_DONTNEED"); + } + clock_gettime(CLOCK_MONOTONIC, &b); + s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.; + + warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" + "%4d succeed, %4d failed, %4d different pages", + s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), + nr_succeed, nr_failed, nr_pages); + } +} diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c new file mode 100644 index 000000000000..7f22844ed704 --- /dev/null +++ b/tools/testing/selftests/mm/userfaultfd.c @@ -0,0 +1,1858 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Stress userfaultfd syscall. + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This test allocates two virtual areas and bounces the physical + * memory across the two virtual areas (from area_src to area_dst) + * using userfaultfd. 
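+ *
+ * One full transfer of every page from area_src to area_dst is called a
+ * "bounce"; the number of bounces to run is given on the command line.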
+ * + * There are three threads running per CPU: + * + * 1) one per-CPU thread takes a per-page pthread_mutex in a random + * page of the area_dst (while the physical page may still be in + * area_src), and increments a per-page counter in the same page, + * and checks its value against a verification region. + * + * 2) another per-CPU thread handles the userfaults generated by + * thread 1 above. userfaultfd blocking reads or poll() modes are + * exercised interleaved. + * + * 3) one last per-CPU thread transfers the memory in the background + * at maximum bandwidth (if not already transferred by thread + * 2). Each cpu thread takes cares of transferring a portion of the + * area. + * + * When all threads of type 3 completed the transfer, one bounce is + * complete. area_src and area_dst are then swapped. All threads are + * respawned and so the bounce is immediately restarted in the + * opposite direction. + * + * per-CPU threads 1 by triggering userfaults inside + * pthread_mutex_lock will also verify the atomicity of the memory + * transfer (UFFDIO_COPY). + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "vm_util.h" + +#ifdef __NR_userfaultfd + +static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; + +#define BOUNCE_RANDOM (1<<0) +#define BOUNCE_RACINGFAULTS (1<<1) +#define BOUNCE_VERIFY (1<<2) +#define BOUNCE_POLL (1<<3) +static int bounces; + +#define TEST_ANON 1 +#define TEST_HUGETLB 2 +#define TEST_SHMEM 3 +static int test_type; + +#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) + +#define BASE_PMD_ADDR ((void *)(1UL << 30)) + +/* test using /dev/userfaultfd, instead of userfaultfd(2) */ +static bool test_dev_userfaultfd; + +/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ +#define ALARM_INTERVAL_SECS 10 +static volatile bool test_uffdio_copy_eexist = true; +static volatile bool test_uffdio_zeropage_eexist = true; +/* Whether to test uffd write-protection */ +static bool test_uffdio_wp = true; +/* Whether to test uffd minor faults */ +static bool test_uffdio_minor = false; +static bool map_shared; +static int mem_fd; +static unsigned long long *count_verify; +static int uffd = -1; +static int uffd_flags, finished, *pipefd; +static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; +static char *zeropage; +pthread_attr_t attr; +static bool test_collapse; + +/* Userfaultfd test statistics */ +struct uffd_stats { + int cpu; + unsigned long missing_faults; + unsigned long wp_faults; + unsigned long minor_faults; +}; + +/* pthread_mutex_t starts at page offset 0 */ +#define area_mutex(___area, ___nr) \ + ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) +/* + * count is placed in the page after pthread_mutex_t naturally aligned + * to avoid non alignment faults on non-x86 archs. 
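+ *
+ * For example (sizes are libc-specific; with glibc on x86-64,
+ * sizeof(pthread_mutex_t) == 40): area_count() below rounds
+ * 40 + 8 - 1 = 47 down to the 8-byte boundary 40, so the counter lives
+ * at byte offset 40 of the page, just past the mutex.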
+ */ +#define area_count(___area, ___nr) \ + ((volatile unsigned long long *) ((unsigned long) \ + ((___area) + (___nr)*page_size + \ + sizeof(pthread_mutex_t) + \ + sizeof(unsigned long long) - 1) & \ + ~(unsigned long)(sizeof(unsigned long long) \ + - 1))) + +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) + +const char *examples = + "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" + "./userfaultfd anon 100 99999\n\n" + "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" + "./userfaultfd anon:dev 100 99999\n\n" + "# Run share memory test on 1GiB region with 99 bounces:\n" + "./userfaultfd shmem 1000 99\n\n" + "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" + "./userfaultfd hugetlb 256 50\n\n" + "# Run the same hugetlb test but using shared file:\n" + "./userfaultfd hugetlb_shared 256 50\n\n" + "# 10MiB-~6GiB 999 bounces anonymous test, " + "continue forever unless an error triggers\n" + "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./userfaultfd " + "[hugetlbfs_file]\n\n"); + fprintf(stderr, "Supported : anon, hugetlb, " + "hugetlb_shared, shmem\n\n"); + fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " + "Supported mods:\n"); + fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); + fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); + fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" + "memory\n"); + fprintf(stderr, "\nExample test mod usage:\n"); + fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); + fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); + + fprintf(stderr, "Examples:\n\n"); + fprintf(stderr, "%s", examples); + exit(1); +} + +#define _err(fmt, ...) \ + do { \ + int ret = errno; \ + fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ + fprintf(stderr, " (errno=%d, line=%d)\n", \ + ret, __LINE__); \ + } while (0) + +#define errexit(exitcode, fmt, ...) \ + do { \ + _err(fmt, ##__VA_ARGS__); \ + exit(exitcode); \ + } while (0) + +#define err(fmt, ...) 
errexit(1, fmt, ##__VA_ARGS__) + +static void uffd_stats_reset(struct uffd_stats *uffd_stats, + unsigned long n_cpus) +{ + int i; + + for (i = 0; i < n_cpus; i++) { + uffd_stats[i].cpu = i; + uffd_stats[i].missing_faults = 0; + uffd_stats[i].wp_faults = 0; + uffd_stats[i].minor_faults = 0; + } +} + +static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) +{ + int i; + unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; + + for (i = 0; i < n_cpus; i++) { + miss_total += stats[i].missing_faults; + wp_total += stats[i].wp_faults; + minor_total += stats[i].minor_faults; + } + + printf("userfaults: "); + if (miss_total) { + printf("%llu missing (", miss_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].missing_faults); + printf("\b) "); + } + if (wp_total) { + printf("%llu wp (", wp_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].wp_faults); + printf("\b) "); + } + if (minor_total) { + printf("%llu minor (", minor_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].minor_faults); + printf("\b)"); + } + printf("\n"); +} + +static void anon_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); +} + +static void anon_allocate_area(void **alloc_area, bool is_src) +{ + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); +} + +static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ +} + +static void hugetlb_release_pages(char *rel_area) +{ + if (!map_shared) { + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + } else { + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); + } +} + +static void hugetlb_allocate_area(void **alloc_area, bool is_src) +{ + off_t size = nr_pages * page_size; + off_t offset = is_src ? 0 : size; + void *area_alias = NULL; + char **alloc_area_alias; + + *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, + (map_shared ? MAP_SHARED : MAP_PRIVATE) | + (is_src ? 0 : MAP_NORESERVE), + mem_fd, offset); + if (*alloc_area == MAP_FAILED) + err("mmap of hugetlbfs file failed"); + + if (map_shared) { + area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED, mem_fd, offset); + if (area_alias == MAP_FAILED) + err("mmap of hugetlb file alias failed"); + } + + if (is_src) { + alloc_area_alias = &area_src_alias; + } else { + alloc_area_alias = &area_dst_alias; + } + if (area_alias) + *alloc_area_alias = area_alias; +} + +static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + if (!map_shared) + return; + + *start = (unsigned long) area_dst_alias + offset; +} + +static void shmem_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); +} + +static void shmem_allocate_area(void **alloc_area, bool is_src) +{ + void *area_alias = NULL; + size_t bytes = nr_pages * page_size; + unsigned long offset = is_src ? 
0 : bytes; + char *p = NULL, *p_alias = NULL; + + if (test_collapse) { + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ + } + + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (*alloc_area == MAP_FAILED) + err("mmap of memfd failed"); + if (test_collapse && *alloc_area != p) + err("mmap of memfd failed at %p", p); + + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (area_alias == MAP_FAILED) + err("mmap of memfd alias failed"); + if (test_collapse && area_alias != p_alias) + err("mmap of anonymous memory failed at %p", p_alias); + + if (is_src) + area_src_alias = area_alias; + else + area_dst_alias = area_alias; +} + +static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + *start = (unsigned long)area_dst_alias + offset; +} + +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +{ + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) + err("Did not find expected %d number of hugepages", + expect_nr_hpages); +} + +struct uffd_test_ops { + void (*allocate_area)(void **alloc_area, bool is_src); + void (*release_pages)(char *rel_area); + void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); +}; + +static struct uffd_test_ops anon_uffd_test_ops = { + .allocate_area = anon_allocate_area, + .release_pages = anon_release_pages, + .alias_mapping = noop_alias_mapping, + .check_pmd_mapping = NULL, +}; + +static struct uffd_test_ops shmem_uffd_test_ops = { + .allocate_area = shmem_allocate_area, + .release_pages = shmem_release_pages, + .alias_mapping = shmem_alias_mapping, + .check_pmd_mapping = shmem_check_pmd_mapping, +}; + +static struct uffd_test_ops hugetlb_uffd_test_ops = { + .allocate_area = hugetlb_allocate_area, + .release_pages = hugetlb_release_pages, + .alias_mapping = hugetlb_alias_mapping, + .check_pmd_mapping = NULL, +}; + +static struct uffd_test_ops *uffd_test_ops; + +static inline uint64_t uffd_minor_feature(void) +{ + if (test_type == TEST_HUGETLB && map_shared) + return UFFD_FEATURE_MINOR_HUGETLBFS; + else if (test_type == TEST_SHMEM) + return UFFD_FEATURE_MINOR_SHMEM; + else + return 0; +} + +static uint64_t get_expected_ioctls(uint64_t mode) +{ + uint64_t ioctls = UFFD_API_RANGE_IOCTLS; + + if (test_type == TEST_HUGETLB) + ioctls &= ~(1 << _UFFDIO_ZEROPAGE); + + if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) + ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); + + if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) + ioctls &= ~(1 << _UFFDIO_CONTINUE); + + return ioctls; +} + +static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) +{ + uint64_t expected = get_expected_ioctls(mode); + uint64_t actual = ioctls & expected; + + if (actual != expected) { + err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, + expected, actual); + } +} + +static int __userfaultfd_open_dev(void) +{ + int fd, _uffd; + + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (fd < 0) + errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); + + _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); + if (_uffd < 0) + errexit(errno == ENOTTY ? 
KSFT_SKIP : 1, + "creating userfaultfd failed"); + close(fd); + return _uffd; +} + +static void userfaultfd_open(uint64_t *features) +{ + struct uffdio_api uffdio_api; + + if (test_dev_userfaultfd) + uffd = __userfaultfd_open_dev(); + else { + uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); + if (uffd < 0) + errexit(errno == ENOSYS ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + } + uffd_flags = fcntl(uffd, F_GETFD, NULL); + + uffdio_api.api = UFFD_API; + uffdio_api.features = *features; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) + err("UFFDIO_API failed.\nPlease make sure to " + "run with either root or ptrace capability."); + if (uffdio_api.api != UFFD_API) + err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); + + *features = uffdio_api.features; +} + +static inline void munmap_area(void **area) +{ + if (*area) + if (munmap(*area, nr_pages * page_size)) + err("munmap"); + + *area = NULL; +} + +static void uffd_test_ctx_clear(void) +{ + size_t i; + + if (pipefd) { + for (i = 0; i < nr_cpus * 2; ++i) { + if (close(pipefd[i])) + err("close pipefd"); + } + free(pipefd); + pipefd = NULL; + } + + if (count_verify) { + free(count_verify); + count_verify = NULL; + } + + if (uffd != -1) { + if (close(uffd)) + err("close uffd"); + uffd = -1; + } + + munmap_area((void **)&area_src); + munmap_area((void **)&area_src_alias); + munmap_area((void **)&area_dst); + munmap_area((void **)&area_dst_alias); + munmap_area((void **)&area_remap); +} + +static void uffd_test_ctx_init(uint64_t features) +{ + unsigned long nr, cpu; + + uffd_test_ctx_clear(); + + uffd_test_ops->allocate_area((void **)&area_src, true); + uffd_test_ops->allocate_area((void **)&area_dst, false); + + userfaultfd_open(&features); + + count_verify = malloc(nr_pages * sizeof(unsigned long long)); + if (!count_verify) + err("count_verify"); + + for (nr = 0; nr < nr_pages; nr++) { + *area_mutex(area_src, nr) = + (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; + count_verify[nr] = *area_count(area_src, nr) = 1; + /* + * In the transition between 255 to 256, powerpc will + * read out of order in my_bcmp and see both bytes as + * zero, so leave a placeholder below always non-zero + * after the count, to avoid my_bcmp to trigger false + * positives. + */ + *(area_count(area_src, nr) + 1) = 1; + } + + /* + * After initialization of area_src, we must explicitly release pages + * for area_dst to make sure it's fully empty. Otherwise we could have + * some area_dst pages be errornously initialized with zero pages, + * hence we could hit memory corruption later in the test. + * + * One example is when THP is globally enabled, above allocate_area() + * calls could have the two areas merged into a single VMA (as they + * will have the same VMA flags so they're mergeable). When we + * initialize the area_src above, it's possible that some part of + * area_dst could have been faulted in via one huge THP that will be + * shared between area_src and area_dst. It could cause some of the + * area_dst won't be trapped by missing userfaults. + * + * This release_pages() will guarantee even if that happened, we'll + * proactively split the thp and drop any accidentally initialized + * pages within area_dst. 
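+	 *
+	 * Concretely (assuming 4 KiB base pages and 2 MiB THPs): one THP
+	 * faulted in around the src/dst boundary while initializing
+	 * area_src could leave up to 511 area_dst pages already mapped,
+	 * and accesses to them would then never reach the userfault
+	 * handlers.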
+ */ + uffd_test_ops->release_pages(area_dst); + + pipefd = malloc(sizeof(int) * nr_cpus * 2); + if (!pipefd) + err("pipefd"); + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) + err("pipe"); +} + +static int my_bcmp(char *str1, char *str2, size_t n) +{ + unsigned long i; + for (i = 0; i < n; i++) + if (str1[i] != str2[i]) + return 1; + return 0; +} + +static void wp_range(int ufd, __u64 start, __u64 len, bool wp) +{ + struct uffdio_writeprotect prms; + + /* Write protection page faults */ + prms.range.start = start; + prms.range.len = len; + /* Undo write-protect, do wakeup after that */ + prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; + + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) + err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); +} + +static void continue_range(int ufd, __u64 start, __u64 len) +{ + struct uffdio_continue req; + int ret; + + req.range.start = start; + req.range.len = len; + req.mode = 0; + + if (ioctl(ufd, UFFDIO_CONTINUE, &req)) + err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, + (uint64_t)start); + + /* + * Error handling within the kernel for continue is subtly different + * from copy or zeropage, so it may be a source of bugs. Trigger an + * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. + */ + req.mapped = 0; + ret = ioctl(ufd, UFFDIO_CONTINUE, &req); + if (ret >= 0 || req.mapped != -EEXIST) + err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, + ret, (int64_t) req.mapped); +} + +static void *locking_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr; + unsigned long long count; + + if (!(bounces & BOUNCE_RANDOM)) { + page_nr = -bounces; + if (!(bounces & BOUNCE_RACINGFAULTS)) + page_nr += cpu * nr_pages_per_cpu; + } + + while (!finished) { + if (bounces & BOUNCE_RANDOM) { + if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) + err("getrandom failed"); + } else + page_nr += 1; + page_nr %= nr_pages; + pthread_mutex_lock(area_mutex(area_dst, page_nr)); + count = *area_count(area_dst, page_nr); + if (count != count_verify[page_nr]) + err("page_nr %lu memory corruption %llu %llu", + page_nr, count, count_verify[page_nr]); + count++; + *area_count(area_dst, page_nr) = count_verify[page_nr] = count; + pthread_mutex_unlock(area_mutex(area_dst, page_nr)); + } + + return NULL; +} + +static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, + unsigned long offset) +{ + uffd_test_ops->alias_mapping(&uffdio_copy->dst, + uffdio_copy->len, + offset); + if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy->copy != -EEXIST) + err("UFFDIO_COPY retry error: %"PRId64, + (int64_t)uffdio_copy->copy); + } else { + err("UFFDIO_COPY retry unexpected: %"PRId64, + (int64_t)uffdio_copy->copy); + } +} + +static void wake_range(int ufd, unsigned long addr, unsigned long len) +{ + struct uffdio_range uffdio_wake; + + uffdio_wake.start = addr; + uffdio_wake.len = len; + + if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) + fprintf(stderr, "error waking %lu\n", + addr), exit(1); +} + +static int __copy_page(int ufd, unsigned long offset, bool retry) +{ + struct uffdio_copy uffdio_copy; + + if (offset >= nr_pages * page_size) + err("unexpected offset %lu\n", offset); + uffdio_copy.dst = (unsigned long) area_dst + offset; + uffdio_copy.src = (unsigned long) area_src + offset; + uffdio_copy.len = page_size; + if (test_uffdio_wp) + uffdio_copy.mode = UFFDIO_COPY_MODE_WP; + else + 
uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy.copy != -EEXIST) + err("UFFDIO_COPY error: %"PRId64, + (int64_t)uffdio_copy.copy); + wake_range(ufd, uffdio_copy.dst, page_size); + } else if (uffdio_copy.copy != page_size) { + err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); + } else { + if (test_uffdio_copy_eexist && retry) { + test_uffdio_copy_eexist = false; + retry_copy_page(ufd, &uffdio_copy, offset); + } + return 1; + } + return 0; +} + +static int copy_page_retry(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, true); +} + +static int copy_page(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, false); +} + +static int uffd_read_msg(int ufd, struct uffd_msg *msg) +{ + int ret = read(uffd, msg, sizeof(*msg)); + + if (ret != sizeof(*msg)) { + if (ret < 0) { + if (errno == EAGAIN || errno == EINTR) + return 1; + err("blocking read error"); + } else { + err("short read"); + } + } + + return 0; +} + +static void uffd_handle_page_fault(struct uffd_msg *msg, + struct uffd_stats *stats) +{ + unsigned long offset; + + if (msg->event != UFFD_EVENT_PAGEFAULT) + err("unexpected msg event %u", msg->event); + + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + /* Write protect page faults */ + wp_range(uffd, msg->arg.pagefault.address, page_size, false); + stats->wp_faults++; + } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { + uint8_t *area; + int b; + + /* + * Minor page faults + * + * To prove we can modify the original range for testing + * purposes, we're going to bit flip this range before + * continuing. + * + * Note that this requires all minor page fault tests operate on + * area_dst (non-UFFD-registered) and area_dst_alias + * (UFFD-registered). + */ + + area = (uint8_t *)(area_dst + + ((char *)msg->arg.pagefault.address - + area_dst_alias)); + for (b = 0; b < page_size; ++b) + area[b] = ~area[b]; + continue_range(uffd, msg->arg.pagefault.address, page_size); + stats->minor_faults++; + } else { + /* + * Missing page faults. + * + * Here we force a write check for each of the missing mode + * faults. It's guaranteed because the only threads that + * will trigger uffd faults are the locking threads, and + * their first instruction to touch the missing page will + * always be pthread_mutex_lock(). + * + * Note that here we relied on an NPTL glibc impl detail to + * always read the lock type at the entry of the lock op + * (pthread_mutex_t.__data.__type, offset 0x10) before + * doing any locking operations to guarantee that. It's + * actually not good to rely on this impl detail because + * logically a pthread-compatible lib can implement the + * locks without types and we can fail when linking with + * them. However since we used to find bugs with this + * strict check we still keep it around. Hopefully this + * could be a good hint when it fails again. If one day + * it'll break on some other impl of glibc we'll revisit. 
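+		 *
+		 * Put differently, the sequence this check assumes for a
+		 * missing page is: pthread_mutex_lock() first *reads* the
+		 * lock type at offset 0x10, that read triggers the missing
+		 * fault without UFFD_PAGEFAULT_FLAG_WRITE set, and only
+		 * after UFFDIO_COPY resolves it does the locking write
+		 * itself proceed.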
+ */ + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + err("unexpected write fault"); + + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + + if (copy_page(uffd, offset)) + stats->missing_faults++; + } +} + +static void *uffd_poll_thread(void *arg) +{ + struct uffd_stats *stats = (struct uffd_stats *)arg; + unsigned long cpu = stats->cpu; + struct pollfd pollfd[2]; + struct uffd_msg msg; + struct uffdio_register uffd_reg; + int ret; + char tmp_chr; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd[cpu*2]; + pollfd[1].events = POLLIN; + + for (;;) { + ret = poll(pollfd, 2, -1); + if (ret <= 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + err("poll error: %d", ret); + } + if (pollfd[1].revents & POLLIN) { + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) + err("read pipefd error"); + break; + } + if (!(pollfd[0].revents & POLLIN)) + err("pollfd[0].revents %d", pollfd[0].revents); + if (uffd_read_msg(uffd, &msg)) + continue; + switch (msg.event) { + default: + err("unexpected msg event %u\n", msg.event); + break; + case UFFD_EVENT_PAGEFAULT: + uffd_handle_page_fault(&msg, stats); + break; + case UFFD_EVENT_FORK: + close(uffd); + uffd = msg.arg.fork.ufd; + pollfd[0].fd = uffd; + break; + case UFFD_EVENT_REMOVE: + uffd_reg.range.start = msg.arg.remove.start; + uffd_reg.range.len = msg.arg.remove.end - + msg.arg.remove.start; + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) + err("remove failure"); + break; + case UFFD_EVENT_REMAP: + area_remap = area_dst; /* save for later unmap */ + area_dst = (char *)(unsigned long)msg.arg.remap.to; + break; + } + } + + return NULL; +} + +pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; + +static void *uffd_read_thread(void *arg) +{ + struct uffd_stats *stats = (struct uffd_stats *)arg; + struct uffd_msg msg; + + pthread_mutex_unlock(&uffd_read_mutex); + /* from here cancellation is ok */ + + for (;;) { + if (uffd_read_msg(uffd, &msg)) + continue; + uffd_handle_page_fault(&msg, stats); + } + + return NULL; +} + +static void *background_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr, start_nr, mid_nr, end_nr; + + start_nr = cpu * nr_pages_per_cpu; + end_nr = (cpu+1) * nr_pages_per_cpu; + mid_nr = (start_nr + end_nr) / 2; + + /* Copy the first half of the pages */ + for (page_nr = start_nr; page_nr < mid_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + /* + * If we need to test uffd-wp, set it up now. 
Then we'll have + * at least the first half of the pages mapped already which + * can be write-protected for testing + */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, + nr_pages_per_cpu * page_size, true); + + /* + * Continue the 2nd half of the page copying, handling write + * protection faults if any + */ + for (page_nr = mid_nr; page_nr < end_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + return NULL; +} + +static int stress(struct uffd_stats *uffd_stats) +{ + unsigned long cpu; + pthread_t locking_threads[nr_cpus]; + pthread_t uffd_threads[nr_cpus]; + pthread_t background_threads[nr_cpus]; + + finished = 0; + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pthread_create(&locking_threads[cpu], &attr, + locking_thread, (void *)cpu)) + return 1; + if (bounces & BOUNCE_POLL) { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_poll_thread, + (void *)&uffd_stats[cpu])) + return 1; + } else { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_read_thread, + (void *)&uffd_stats[cpu])) + return 1; + pthread_mutex_lock(&uffd_read_mutex); + } + if (pthread_create(&background_threads[cpu], &attr, + background_thread, (void *)cpu)) + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(background_threads[cpu], NULL)) + return 1; + + /* + * Be strict and immediately zap area_src, the whole area has + * been transferred already by the background treads. The + * area_src could then be faulted in a racy way by still + * running uffdio_threads reading zeropages after we zapped + * area_src (but they're guaranteed to get -EEXIST from + * UFFDIO_COPY without writing zero pages into area_dst + * because the background threads already completed). + */ + uffd_test_ops->release_pages(area_src); + + finished = 1; + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(locking_threads[cpu], NULL)) + return 1; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + char c; + if (bounces & BOUNCE_POLL) { + if (write(pipefd[cpu*2+1], &c, 1) != 1) + err("pipefd write error"); + if (pthread_join(uffd_threads[cpu], + (void *)&uffd_stats[cpu])) + return 1; + } else { + if (pthread_cancel(uffd_threads[cpu])) + return 1; + if (pthread_join(uffd_threads[cpu], NULL)) + return 1; + } + } + + return 0; +} + +sigjmp_buf jbuf, *sigbuf; + +static void sighndl(int sig, siginfo_t *siginfo, void *ptr) +{ + if (sig == SIGBUS) { + if (sigbuf) + siglongjmp(*sigbuf, 1); + abort(); + } +} + +/* + * For non-cooperative userfaultfd test we fork() a process that will + * generate pagefaults, will mremap the area monitored by the + * userfaultfd and at last this process will release the monitored + * area. + * For the anonymous and shared memory the area is divided into two + * parts, the first part is accessed before mremap, and the second + * part is accessed after mremap. Since hugetlbfs does not support + * mremap, the entire monitored area is accessed in a single pass for + * HUGETLB_TEST. + * The release of the pages currently generates event for shmem and + * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked + * for hugetlb. + * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register + * monitored area, generate pagefaults and test that signal is delivered. + * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 + * test robustness use case - we release monitored area, fork a process + * that will generate pagefaults and verify signal is generated. 
+ * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal + * feature. Using monitor thread, verify no userfault events are generated. + */ +static int faulting_process(int signal_test) +{ + unsigned long nr; + unsigned long long count; + unsigned long split_nr_pages; + unsigned long lastnr; + struct sigaction act; + volatile unsigned long signalled = 0; + + split_nr_pages = (nr_pages + 1) / 2; + + if (signal_test) { + sigbuf = &jbuf; + memset(&act, 0, sizeof(act)); + act.sa_sigaction = sighndl; + act.sa_flags = SA_SIGINFO; + if (sigaction(SIGBUS, &act, 0)) + err("sigaction"); + lastnr = (unsigned long)-1; + } + + for (nr = 0; nr < split_nr_pages; nr++) { + volatile int steps = 1; + unsigned long offset = nr * page_size; + + if (signal_test) { + if (sigsetjmp(*sigbuf, 1) != 0) { + if (steps == 1 && nr == lastnr) + err("Signal repeated"); + + lastnr = nr; + if (signal_test == 1) { + if (steps == 1) { + /* This is a MISSING request */ + steps++; + if (copy_page(uffd, offset)) + signalled++; + } else { + /* This is a WP request */ + assert(steps == 2); + wp_range(uffd, + (__u64)area_dst + + offset, + page_size, false); + } + } else { + signalled++; + continue; + } + } + } + + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + /* + * Trigger write protection if there is by writing + * the same value back. + */ + *area_count(area_dst, nr) = count; + } + + if (signal_test) + return signalled != split_nr_pages; + + area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, area_src); + if (area_dst == MAP_FAILED) + err("mremap"); + /* Reset area_src since we just clobbered it */ + area_src = NULL; + + for (; nr < nr_pages; nr++) { + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) { + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + } + /* + * Trigger write protection if there is by writing + * the same value back. 
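+	 * Writing back the value that was just read keeps the counters
+	 * consistent whether or not the page is currently
+	 * write-protected; the store exists purely to force a write
+	 * access.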
+ */ + *area_count(area_dst, nr) = count; + } + + uffd_test_ops->release_pages(area_dst); + + for (nr = 0; nr < nr_pages; nr++) + if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) + err("nr %lu is not zero", nr); + + return 0; +} + +static void retry_uffdio_zeropage(int ufd, + struct uffdio_zeropage *uffdio_zeropage, + unsigned long offset) +{ + uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, + uffdio_zeropage->range.len, + offset); + if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { + if (uffdio_zeropage->zeropage != -EEXIST) + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } else { + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } +} + +static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) +{ + struct uffdio_zeropage uffdio_zeropage; + int ret; + bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); + __s64 res; + + if (offset >= nr_pages * page_size) + err("unexpected offset %lu", offset); + uffdio_zeropage.range.start = (unsigned long) area_dst + offset; + uffdio_zeropage.range.len = page_size; + uffdio_zeropage.mode = 0; + ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); + res = uffdio_zeropage.zeropage; + if (ret) { + /* real retval in uffdio_zeropage.zeropage */ + if (has_zeropage) + err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); + else if (res != -EINVAL) + err("UFFDIO_ZEROPAGE not -EINVAL"); + } else if (has_zeropage) { + if (res != page_size) { + err("UFFDIO_ZEROPAGE unexpected size"); + } else { + if (test_uffdio_zeropage_eexist && retry) { + test_uffdio_zeropage_eexist = false; + retry_uffdio_zeropage(ufd, &uffdio_zeropage, + offset); + } + return 1; + } + } else + err("UFFDIO_ZEROPAGE succeeded"); + + return 0; +} + +static int uffdio_zeropage(int ufd, unsigned long offset) +{ + return __uffdio_zeropage(ufd, offset, false); +} + +/* exercise UFFDIO_ZEROPAGE */ +static int userfaultfd_zeropage_test(void) +{ + struct uffdio_register uffdio_register; + + printf("testing UFFDIO_ZEROPAGE: "); + fflush(stdout); + + uffd_test_ctx_init(0); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (uffdio_zeropage(uffd, 0)) + if (my_bcmp(area_dst, zeropage, page_size)) + err("zeropage is not zero"); + + printf("done.\n"); + return 0; +} + +static int userfaultfd_events_test(void) +{ + struct uffdio_register uffdio_register; + pthread_t uffd_mon; + int err, features; + pid_t pid; + char c; + struct uffd_stats stats = { 0 }; + + printf("testing events (fork, remap, remove): "); + fflush(stdout); + + features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | + UFFD_FEATURE_EVENT_REMOVE; + uffd_test_ctx_init(features); + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (pthread_create(&uffd_mon,
&attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(0)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + return 1; + + uffd_stats_report(&stats, 1); + + return stats.missing_faults != nr_pages; +} + +static int userfaultfd_sig_test(void) +{ + struct uffdio_register uffdio_register; + unsigned long userfaults; + pthread_t uffd_mon; + int err, features; + pid_t pid; + char c; + struct uffd_stats stats = { 0 }; + + printf("testing signal delivery: "); + fflush(stdout); + + features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; + uffd_test_ctx_init(features); + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (faulting_process(1)) + err("faulting process failed"); + + uffd_test_ops->release_pages(area_dst); + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(2)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, (void **)&userfaults)) + return 1; + + printf("done.\n"); + if (userfaults) + err("Signal test failed, userfaults: %ld", userfaults); + + return userfaults != 0; +} + +void check_memory_contents(char *p) +{ + unsigned long i; + uint8_t expected_byte; + void *expected_page; + + if (posix_memalign(&expected_page, page_size, page_size)) + err("out of memory"); + + for (i = 0; i < nr_pages; ++i) { + expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, p + (i * page_size), page_size)) + err("unexpected page contents after minor fault"); + } + + free(expected_page); +} + +static int userfaultfd_minor_test(void) +{ + unsigned long p; + struct uffdio_register uffdio_register; + pthread_t uffd_mon; + char c; + struct uffd_stats stats = { 0 }; + + if (!test_uffdio_minor) + return 0; + + printf("testing minor faults: "); + fflush(stdout); + + uffd_test_ctx_init(uffd_minor_feature()); + + uffdio_register.range.start = (unsigned long)area_dst_alias; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + /* + * After registering with UFFD, populate the non-UFFD-registered side of + * the shared mapping. This should *not* trigger any UFFD minor faults. + */ + for (p = 0; p < nr_pages; ++p) { + memset(area_dst + (p * page_size), p % ((uint8_t)-1), + page_size); + } + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + /* + * Read each of the pages back using the UFFD-registered mapping. 
We + * expect that the first time we touch a page, it will result in a minor + * fault. uffd_poll_thread will resolve the fault by bit-flipping the + * page's contents, and then issuing a CONTINUE ioctl. + */ + check_memory_contents(area_dst_alias); + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + return 1; + + uffd_stats_report(&stats, 1); + + if (test_collapse) { + printf("testing collapse of uffd memory into PMD-mapped THPs:"); + if (madvise(area_dst_alias, nr_pages * page_size, + MADV_COLLAPSE)) + err("madvise(MADV_COLLAPSE)"); + + uffd_test_ops->check_pmd_mapping(area_dst, + nr_pages * page_size / + hpage_size); + /* + * This won't cause uffd-fault - it purely just makes sure there + * was no corruption. + */ + check_memory_contents(area_dst_alias); + printf(" done.\n"); + } + + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; +} + +#define BIT_ULL(nr) (1ULL << (nr)) +#define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_UFFD_WP BIT_ULL(57) +#define PM_FILE BIT_ULL(61) +#define PM_SWAP BIT_ULL(62) +#define PM_PRESENT BIT_ULL(63) + +static int pagemap_open(void) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) + err("open pagemap"); + + return fd; +} + +static uint64_t pagemap_read_vaddr(int fd, void *vaddr) +{ + uint64_t value; + int ret; + + ret = pread(fd, &value, sizeof(uint64_t), + ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); + if (ret != sizeof(uint64_t)) + err("pread() on pagemap failed"); + + return value; +} + +/* This macro lets __LINE__ work in err() */ +#define pagemap_check_wp(value, wp) do { \ + if (!!(value & PM_UFFD_WP) != wp) \ + err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ + } while (0) + +static int pagemap_test_fork(bool present) +{ + pid_t child = fork(); + uint64_t value; + int fd, result; + + if (!child) { + /* Open the pagemap fd of the child itself */ + fd = pagemap_open(); + value = pagemap_read_vaddr(fd, area_dst); + /* + * After fork() uffd-wp bit should be gone as long as we're + * without UFFD_FEATURE_EVENT_FORK + */ + pagemap_check_wp(value, false); + /* Succeed */ + exit(0); + } + waitpid(child, &result, 0); + return result; +} + +static void userfaultfd_pagemap_test(unsigned int test_pgsize) +{ + struct uffdio_register uffdio_register; + int pagemap_fd; + uint64_t value; + + /* Pagemap tests uffd-wp only */ + if (!test_uffdio_wp) + return; + + /* Not enough memory to test this page size */ + if (test_pgsize > nr_pages * page_size) + return; + + printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); + /* Flush so it doesn't flush twice in parent/child later */ + fflush(stdout); + + uffd_test_ctx_init(0); + + if (test_pgsize > page_size) { + /* This is a thp test */ + if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) + err("madvise(MADV_HUGEPAGE) failed"); + } else if (test_pgsize == page_size) { + /* This is normal page test; force no thp */ + if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) + err("madvise(MADV_NOHUGEPAGE) failed"); + } + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failed"); + + pagemap_fd = pagemap_open(); + + /* Touch the page */ + *area_dst = 1; + wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value,
true); + /* Make sure the uffd-wp bit is dropped on fork */ + if (pagemap_test_fork(true)) + err("Detected stale uffd-wp bit in child"); + + /* Exclusive required or PAGEOUT won't work */ + if (!(value & PM_MMAP_EXCLUSIVE)) + err("multiple mapping detected: 0x%"PRIx64, value); + + if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) + err("madvise(MADV_PAGEOUT) failed"); + + /* Uffd-wp should persist even when swapped out */ + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + /* Make sure the uffd-wp bit is dropped on fork */ + if (pagemap_test_fork(false)) + err("Detected stale uffd-wp bit in child"); + + /* Unprotect; this tests swap pte modifications */ + wp_range(uffd, (uint64_t)area_dst, page_size, false); + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Fault in the page from disk */ + *area_dst = 2; + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + close(pagemap_fd); + printf("done\n"); +} + +static int userfaultfd_stress(void) +{ + void *area; + unsigned long nr; + struct uffdio_register uffdio_register; + struct uffd_stats uffd_stats[nr_cpus]; + + uffd_test_ctx_init(0); + + if (posix_memalign(&area, page_size, page_size)) + err("out of memory"); + zeropage = area; + bzero(zeropage, page_size); + + pthread_mutex_lock(&uffd_read_mutex); + + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 16*1024*1024); + + while (bounces--) { + printf("bounces: %d, mode:", bounces); + if (bounces & BOUNCE_RANDOM) + printf(" rnd"); + if (bounces & BOUNCE_RACINGFAULTS) + printf(" racing"); + if (bounces & BOUNCE_VERIFY) + printf(" ver"); + if (bounces & BOUNCE_POLL) + printf(" poll"); + else + printf(" read"); + printf(", "); + fflush(stdout); + + if (bounces & BOUNCE_POLL) + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + else + fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); + + /* register */ + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (area_dst_alias) { + uffdio_register.range.start = (unsigned long) + area_dst_alias; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure alias"); + } + + /* + * The madvise done previously isn't enough: some + * uffd_thread could have read userfaults (one of + * those already resolved by the background thread) + * and it may be in the process of calling + * UFFDIO_COPY. UFFDIO_COPY will read the zapped + * area_src and it would map a zero page in it (of + * course such a UFFDIO_COPY is perfectly safe as it'd + * return -EEXIST). The problem comes at the next + * bounce though: that racing UFFDIO_COPY would + * generate zeropages in the area_src, thus invalidating + * the previous MADV_DONTNEED. Without this additional + * MADV_DONTNEED those zeropage leftovers in the + * area_src would lead to -EEXIST failure during the + * next bounce, effectively leaving a zeropage in the + * area_dst. + * + * Try commenting out this madvise to see the memory + * corruption being caught pretty quickly. + * + * khugepaged is also only inhibited from collapsing THPs + * after the UFFDIO_REGISTER, so the MADV_DONTNEED is + * required here as well.
+ */ + uffd_test_ops->release_pages(area_dst); + + uffd_stats_reset(uffd_stats, nr_cpus); + + /* bounce pass */ + if (stress(uffd_stats)) + return 1; + + /* Clear all the write protections if there are any */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst, + nr_pages * page_size, false); + + /* unregister */ + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) + err("unregister failure"); + if (area_dst_alias) { + uffdio_register.range.start = (unsigned long) area_dst; + if (ioctl(uffd, UFFDIO_UNREGISTER, + &uffdio_register.range)) + err("unregister failure alias"); + } + + /* verification */ + if (bounces & BOUNCE_VERIFY) + for (nr = 0; nr < nr_pages; nr++) + if (*area_count(area_dst, nr) != count_verify[nr]) + err("error area_count %llu %llu %lu\n", + *area_count(area_src, nr), + count_verify[nr], nr); + + /* prepare next bounce */ + swap(area_src, area_dst); + + swap(area_src_alias, area_dst_alias); + + uffd_stats_report(uffd_stats, nr_cpus); + } + + if (test_type == TEST_ANON) { + /* + * shmem/hugetlb won't be able to run since they have different + * behavior on fork() (file-backed memory normally drops ptes + * directly when forking), meanwhile the pagemap test will verify + * the pgtable entries of the fork()ed child. + */ + userfaultfd_pagemap_test(page_size); + /* + * Hard-code for x86_64 for now for 2M THP, as x86_64 is + * currently the only one that supports uffd-wp + */ + userfaultfd_pagemap_test(page_size * 512); + } + + return userfaultfd_zeropage_test() || userfaultfd_sig_test() + || userfaultfd_events_test() || userfaultfd_minor_test(); +}
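For orientation, the bounces argument driving the loop above is a plain countdown whose low bits double as the per-pass mode mask, which is why each pass prints a different mode combination. A sketch of the encoding (the values mirror the BOUNCE_* definitions near the top of this file):

    #define BOUNCE_RANDOM           (1<<0)
    #define BOUNCE_RACINGFAULTS     (1<<1)
    #define BOUNCE_VERIFY           (1<<2)
    #define BOUNCE_POLL             (1<<3)

    /*
     * With bounces = 16 the passes run with the counter at 15..0 and so
     * sweep every combination: 15 = rnd+racing+ver+poll, 14 =
     * racing+ver+poll, ..., 1 = rnd in read mode, 0 = plain read mode.
     */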
+ +/* + * Copied from mlock2-tests.c + */ +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +static void set_test_type(const char *type) +{ + if (!strcmp(type, "anon")) { + test_type = TEST_ANON; + uffd_test_ops = &anon_uffd_test_ops; + } else if (!strcmp(type, "hugetlb")) { + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + } else if (!strcmp(type, "hugetlb_shared")) { + map_shared = true; + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + /* Minor faults require shared hugetlb; only enable here. */ + test_uffdio_minor = true; + } else if (!strcmp(type, "shmem")) { + map_shared = true; + test_type = TEST_SHMEM; + uffd_test_ops = &shmem_uffd_test_ops; + test_uffdio_minor = true; + } +} + +static void parse_test_type_arg(const char *raw_type) +{ + char *buf = strdup(raw_type); + uint64_t features = UFFD_API_FEATURES; + + while (buf) { + const char *token = strsep(&buf, ":"); + + if (!test_type) + set_test_type(token); + else if (!strcmp(token, "dev")) + test_dev_userfaultfd = true; + else if (!strcmp(token, "syscall")) + test_dev_userfaultfd = false; + else if (!strcmp(token, "collapse")) + test_collapse = true; + else + err("unrecognized test mod '%s'", token); + } + + if (!test_type) + err("failed to parse test type argument: '%s'", raw_type); + + if (test_collapse && test_type != TEST_SHMEM) + err("Unsupported test: %s", raw_type); + + if (test_type == TEST_HUGETLB) + page_size = hpage_size; + else + page_size = sysconf(_SC_PAGE_SIZE); + + if (!page_size) + err("Unable to determine page size"); + if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 + > page_size) + err("Impossible to run this test"); + + /* + * Whether we can test certain features depends not just on test type, + * but also on whether or not this particular kernel supports the + * feature. + */ + + userfaultfd_open(&features); + + test_uffdio_wp = test_uffdio_wp && + (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); + test_uffdio_minor = test_uffdio_minor && + (features & uffd_minor_feature()); + + close(uffd); + uffd = -1; +} + +static void sigalrm(int sig) +{ + if (sig != SIGALRM) + abort(); + test_uffdio_copy_eexist = true; + test_uffdio_zeropage_eexist = true; + alarm(ALARM_INTERVAL_SECS); +} + +int main(int argc, char **argv) +{ + size_t bytes; + + if (argc < 4) + usage(); + + if (signal(SIGALRM, sigalrm) == SIG_ERR) + err("failed to arm SIGALRM"); + alarm(ALARM_INTERVAL_SECS); + + hpage_size = default_huge_page_size(); + parse_test_type_arg(argv[1]); + bytes = atol(argv[2]) * 1024 * 1024; + + if (test_collapse && bytes & (hpage_size - 1)) + err("MiB must be multiple of %lu if :collapse mod set", + hpage_size >> 20); + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + + if (test_collapse) { + /* nr_cpus must divide (bytes / page_size), otherwise, + * area allocations of (nr_pages * page_size) won't be a + * multiple of hpage_size, even if bytes is a multiple of + * hpage_size. + * + * This means that nr_cpus must divide (N * (2 << (H-P))), + * where: + * bytes = hpage_size * N + * hpage_size = 2 << H + * page_size = 2 << P + * + * And we want to choose nr_cpus to be the largest value + * satisfying this constraint, not larger than the number + * of online CPUs. Unfortunately, the prime factorization of + * N and nr_cpus may be arbitrary, so we would have to search + * for it. Instead, just use the highest power of 2 dividing + * both nr_cpus and (bytes / page_size). + */ + int x = factor_of_2(nr_cpus); + int y = factor_of_2(bytes / page_size); + + nr_cpus = x < y ? x : y;
+ } + nr_pages_per_cpu = bytes / page_size / nr_cpus; + if (!nr_pages_per_cpu) { + _err("invalid MiB"); + usage(); + } + + bounces = atoi(argv[3]); + if (bounces <= 0) { + _err("invalid bounces"); + usage(); + } + nr_pages = nr_pages_per_cpu * nr_cpus; + + if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) { + unsigned int memfd_flags = 0; + + if (test_type == TEST_HUGETLB) + memfd_flags = MFD_HUGETLB; + mem_fd = memfd_create(argv[0], memfd_flags); + if (mem_fd < 0) + err("memfd_create"); + if (ftruncate(mem_fd, nr_pages * page_size * 2)) + err("ftruncate"); + if (fallocate(mem_fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, + nr_pages * page_size * 2)) + err("fallocate"); + } + printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", + nr_pages, nr_pages_per_cpu); + return userfaultfd_stress(); +} + +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/mm/util.h b/tools/testing/selftests/mm/util.h new file mode 100644 index 000000000000..b27d26199334 --- /dev/null +++ b/tools/testing/selftests/mm/util.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __KSELFTEST_VM_UTIL_H +#define __KSELFTEST_VM_UTIL_H + +#include <stdint.h> +#include <sys/mman.h> +#include <err.h> +#include <string.h> /* ffsl() */ +#include <unistd.h> /* _SC_PAGESIZE */ + +static unsigned int __page_size; +static unsigned int __page_shift; + +static inline unsigned int page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned int page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +#define PAGE_SHIFT (page_shift()) +#define PAGE_SIZE (page_size()) +/* + * On ppc64 this will only work with radix 2M hugepage size + */ +#define HPAGE_SHIFT 21 +#define HPAGE_SIZE (1 << HPAGE_SHIFT) + +#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) +#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) + + +static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd) +{ + uint64_t ent[2]; + + /* drop pmd */ + if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANONYMOUS | + MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) + errx(2, "mmap transhuge"); + + if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + /* allocate transparent huge page */ + *(volatile void **)ptr = ptr; + + if (pread(pagemap_fd, ent, sizeof(ent), + (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) + err(2, "read pagemap"); + + if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && + PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && + !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) + return PAGEMAP_PFN(ent[0]); + + return -1; +} + +#endif
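allocate_transhuge() above reports success only when pagemap shows two present, physically contiguous entries whose head PFN is HPAGE_SIZE-aligned, i.e. when the range is genuinely backed by a THP head page. A hypothetical caller sketch (not part of this patch; reading PFNs out of pagemap generally requires root):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdint.h>
    #include "util.h"

    int main(void)
    {
            int pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
            /* Reserve a window big enough to carve out an aligned chunk. */
            void *raw = mmap(NULL, 2 * HPAGE_SIZE, PROT_NONE,
                             MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
                             -1, 0);
            uintptr_t aligned = ((uintptr_t)raw + HPAGE_SIZE - 1) &
                                ~(uintptr_t)(HPAGE_SIZE - 1);

            if (pagemap_fd < 0 || raw == MAP_FAILED)
                    return 1;
            if (allocate_transhuge((void *)aligned, pagemap_fd) < 0)
                    printf("range was not backed by a THP\n");
            return 0;
    }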
diff --git a/tools/testing/selftests/mm/va_128TBswitch.c b/tools/testing/selftests/mm/va_128TBswitch.c new file mode 100644 index 000000000000..1d2068989883 --- /dev/null +++ b/tools/testing/selftests/mm/va_128TBswitch.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Authors: Kirill A. Shutemov + * Authors: Aneesh Kumar K.V + */ + +#include <stdio.h> +#include <sys/mman.h> +#include <string.h> + +#include "../kselftest.h" + +#ifdef __powerpc64__ +#define PAGE_SIZE (64 << 10) +/* + * This will work with 16M and 2M hugepage size + */ +#define HUGETLB_SIZE (16 << 20) +#else +#define PAGE_SIZE (4 << 10) +#define HUGETLB_SIZE (2 << 20) +#endif + +/* + * >= 128TB is the hint addr value we use to select + * the large address space. + */ +#define ADDR_SWITCH_HINT (1UL << 47) +#define LOW_ADDR ((void *) (1UL << 30)) +#define HIGH_ADDR ((void *) (1UL << 48)) + +struct testcase { + void *addr; + unsigned long size; + unsigned long flags; + const char *msg; + unsigned int low_addr_required:1; + unsigned int keep_mapped:1; +}; + +static struct testcase testcases[] = { + { + /* + * If stack is moved, we could possibly allocate + * this at the requested address. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + /* + * We should never allocate at the requested address or above it. + * The len crosses the 128TB boundary. Without MAP_FIXED + * we will always search in the lower address space. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", + .low_addr_required = 1, + }, + { + /* + * Exact mapping at 128TB, the area is free, so we should get it + * even without MAP_FIXED. + */ + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, + { + .addr = NULL, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED)", + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1) again", + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", + .low_addr_required = 1, +
.keep_mapped = 1, + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, +}; + +static struct testcase hugetlb_testcases[] = { + { + .addr = NULL, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB) again", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", + }, +}; + +static int run_test(struct testcase *test, int count) +{ + void *p; + int i, ret = KSFT_PASS; + + for (i = 0; i < count; i++) { + struct testcase *t = test + i; + + p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); + + printf("%s: %p - ", t->msg, p); + + if (p == MAP_FAILED) { + printf("FAILED\n"); + ret = KSFT_FAIL; + continue; + } + + if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { + printf("FAILED\n"); + ret = KSFT_FAIL; + } else { + /* + * Do a dereference of the address returned so that we catch + * bugs in page fault handling + */ + memset(p, 0, t->size); + printf("OK\n"); + } + if (!t->keep_mapped) + munmap(p, t->size); + } + + return ret; +} + +static int supported_arch(void) +{ +#if defined(__powerpc64__) + return 1; +#elif defined(__x86_64__) + return 1; +#else + return 0; +#endif +} + +int main(int argc, char **argv) +{ + int ret; + + if (!supported_arch()) + return KSFT_SKIP; + + ret = run_test(testcases, ARRAY_SIZE(testcases)); + if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) + ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); + return ret; +} diff --git a/tools/testing/selftests/mm/va_128TBswitch.sh b/tools/testing/selftests/mm/va_128TBswitch.sh new file mode 100644 index 000000000000..41580751dc51 --- /dev/null +++ b/tools/testing/selftests/mm/va_128TBswitch.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2022 Adam Sindelar (Meta) +# +# This is a 
test for mmap behavior with 5-level paging. This script wraps the +# real test to check that the kernel is configured to support at least 5 +# pagetable levels. + +# exit code 1 means the test failed +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +fail() +{ + echo "$1" + exit $exitcode +} + +check_supported_x86_64() +{ + local config="/proc/config.gz" + [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" + [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" + + # gzip -dcfq automatically handles both compressed and plaintext input. + # See man 1 gzip under '-f'. + local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) + + if [[ "${pg_table_levels}" -lt 5 ]]; then + echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" + exit $ksft_skip + fi +} + +check_test_requirements() +{ + # The test supports x86_64 and powerpc64. We currently have no useful + # eligibility check for powerpc64, and the test itself will reject other + # architectures. + case `uname -m` in + "x86_64") + check_supported_x86_64 + ;; + *) + return 0 + ;; + esac +} + +check_test_requirements +./va_128TBswitch
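Both this wrapper's target and virtual_address_range.c below probe the same kernel policy: without a hint, mmap() keeps allocations below the 128TB mark, and only an explicitly high hint address opts the process into the larger address space. A standalone sketch of that behavior (assuming x86_64 with 5-level page tables):

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
            const unsigned long mark = 1UL << 47; /* first address beyond 128TB */
            void *lo = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            void *hi = mmap((void *)(1UL << 48), 4096, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            printf("no hint:   %p (%s the 128TB mark)\n", lo,
                   (unsigned long)lo < mark ? "below" : "above");
            printf("high hint: %p (%s the 128TB mark)\n", hi,
                   (unsigned long)hi >= mark ? "above" : "below");
            return 0;
    }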
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c new file mode 100644 index 000000000000..c0592646ed93 --- /dev/null +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2017, Anshuman Khandual, IBM Corp. + * + * Works on architectures which support 128TB virtual + * address range and beyond. + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <sys/time.h> + +/* + * Maximum address range mapped with a single mmap() + * call is a little bit more than 16GB. Hence 16GB is + * chosen as the single chunk size for address space + * mapping. + */ +#define MAP_CHUNK_SIZE 17179869184UL /* 16GB */ + +/* + * Address space till 128TB is mapped without any hint + * and is enabled by default. Address space beyond 128TB + * till 512TB is obtained by passing hint address as the + * first argument into mmap() system call. + * + * The process heap address space is divided into two + * different areas, one below 128TB and one above 128TB + * till it reaches 512TB, one with size 128TB and the + * other being 384TB. + * + * On Arm64 the address space is 256TB and no high mappings + * are supported so far. + */ + +#define NR_CHUNKS_128TB 8192UL /* Number of 16GB chunks for 128TB */ +#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) +#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) + +#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ +#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ + +#ifdef __aarch64__ +#define HIGH_ADDR_MARK ADDR_MARK_256TB +#define HIGH_ADDR_SHIFT 49 +#define NR_CHUNKS_LOW NR_CHUNKS_256TB +#define NR_CHUNKS_HIGH 0 +#else +#define HIGH_ADDR_MARK ADDR_MARK_128TB +#define HIGH_ADDR_SHIFT 48 +#define NR_CHUNKS_LOW NR_CHUNKS_128TB +#define NR_CHUNKS_HIGH NR_CHUNKS_384TB +#endif + +static char *hint_addr(void) +{ + int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); + + return (char *) (1UL << bits); +} + +static int validate_addr(char *ptr, int high_addr) +{ + unsigned long addr = (unsigned long) ptr; + + if (high_addr) { + if (addr < HIGH_ADDR_MARK) { + printf("Bad address %lx\n", addr); + return 1; + } + return 0; + } + + if (addr > HIGH_ADDR_MARK) { + printf("Bad address %lx\n", addr); + return 1; + } + return 0; +} + +static int validate_lower_address_hint(void) +{ + char *ptr; + + ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | + PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (ptr == MAP_FAILED) + return 0; + + return 1; +} + +int main(int argc, char *argv[]) +{ + char *ptr[NR_CHUNKS_LOW]; + char *hptr[NR_CHUNKS_HIGH]; + char *hint; + unsigned long i, lchunks, hchunks; + + for (i = 0; i < NR_CHUNKS_LOW; i++) { + ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (ptr[i] == MAP_FAILED) { + if (validate_lower_address_hint()) + return 1; + break; + } + + if (validate_addr(ptr[i], 0)) + return 1; + } + lchunks = i; + + for (i = 0; i < NR_CHUNKS_HIGH; i++) { + hint = hint_addr(); + hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (hptr[i] == MAP_FAILED) + break; + + if (validate_addr(hptr[i], 1)) + return 1; + } + hchunks = i; + + for (i = 0; i < lchunks; i++) + munmap(ptr[i], MAP_CHUNK_SIZE); + + for (i = 0; i < hchunks; i++) + munmap(hptr[i], MAP_CHUNK_SIZE); + + return 0; +} diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c new file mode 100644 index 000000000000..40e795624ff3 --- /dev/null +++ b/tools/testing/selftests/mm/vm_util.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <string.h> +#include <fcntl.h> +#include "../kselftest.h" +#include "vm_util.h" + +#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" +#define SMAP_FILE_PATH "/proc/self/smaps" +#define MAX_LINE_LENGTH 500 + +uint64_t pagemap_get_entry(int fd, char *start) +{ + const unsigned long pfn = (unsigned long)start / getpagesize(); + uint64_t entry; + int ret; + + ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry)); + if (ret != sizeof(entry)) + ksft_exit_fail_msg("reading pagemap failed\n"); + return entry; +} + +bool pagemap_is_softdirty(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + // Check if the soft-dirty bit (bit 55) is set + return entry & 0x0080000000000000ull; +} + +bool pagemap_is_swapped(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + return entry & 0x4000000000000000ull; +} + +bool pagemap_is_populated(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + /* Present or swapped. */ + return entry & 0xc000000000000000ull; +}
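These predicates each test a single bit of the 64-bit pagemap entry (soft-dirty is bit 55, swap bit 62, present bit 63, per Documentation/admin-guide/mm/pagemap.rst). A hypothetical round-trip through the soft-dirty helpers, together with clear_softdirty() defined just below (assumes a kernel with CONFIG_MEM_SOFT_DIRTY):

    #include <assert.h>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include "vm_util.h"

    int main(void)
    {
            int fd = open("/proc/self/pagemap", O_RDONLY);
            char *page = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            page[0] = 1;            /* fault in: the page becomes soft-dirty */
            clear_softdirty();      /* writes "4" to /proc/self/clear_refs */
            assert(!pagemap_is_softdirty(fd, page));
            page[0] = 2;            /* write again... */
            assert(pagemap_is_softdirty(fd, page)); /* ...bit 55 is set again */
            return 0;
    }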
+ +unsigned long pagemap_get_pfn(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + /* If present (bit 63), the PFN is at bits 0-54. */ + if (entry & 0x8000000000000000ull) + return entry & 0x007fffffffffffffull; + return -1ul; +} + +void clear_softdirty(void) +{ + int ret; + const char *ctrl = "4"; + int fd = open("/proc/self/clear_refs", O_WRONLY); + + if (fd < 0) + ksft_exit_fail_msg("opening clear_refs failed\n"); + ret = write(fd, ctrl, strlen(ctrl)); + close(fd); + if (ret != strlen(ctrl)) + ksft_exit_fail_msg("writing clear_refs failed\n"); +} + +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) +{ + while (fgets(buf, len, fp)) { + if (!strncmp(buf, pattern, strlen(pattern))) + return true; + } + return false; +} + +uint64_t read_pmd_pagesize(void) +{ + int fd; + char buf[20]; + ssize_t num_read; + + fd = open(PMD_SIZE_FILE_PATH, O_RDONLY); + if (fd == -1) + ksft_exit_fail_msg("Open hpage_pmd_size failed\n"); + + num_read = read(fd, buf, 19); + if (num_read < 1) { + close(fd); + ksft_exit_fail_msg("Read hpage_pmd_size failed\n"); + } + buf[num_read] = '\0'; + close(fd); + + return strtoul(buf, NULL, 10); +}
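__check_huge() below works by first locating the VMA's own line in /proc/self/smaps and then the requested field (AnonHugePages, FilePmdMapped or ShmemPmdMapped) inside that block, so the address handed to the check_huge_*() wrappers must be the start of a VMA. A hypothetical caller sketch honoring that constraint (not part of this patch; assumes <stdint.h>, <stdbool.h>, <string.h>, <sys/mman.h> and vm_util.h, plus THP enabled at least in madvise mode):

    static bool thp_backed_demo(void)
    {
            uint64_t pmd = read_pmd_pagesize();
            char *map = mmap(NULL, 3 * pmd, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            char *a = (char *)(((uintptr_t)map + pmd - 1) &
                               ~(uintptr_t)(pmd - 1));

            if (map == MAP_FAILED)
                    return false;
            /* Trim so the aligned chunk is its own VMA, as smaps reports it. */
            if (a > map)
                    munmap(map, a - map);
            munmap(a + pmd, map + 3 * pmd - (a + pmd));

            madvise(a, pmd, MADV_HUGEPAGE);
            memset(a, 1, pmd);      /* fault the range in */
            return check_huge_anon(a, 1, pmd);
    }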
+ +bool __check_huge(void *addr, char *pattern, int nr_hpages, + uint64_t hpage_size) +{ + uint64_t thp = -1; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) + ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); + + fp = fopen(SMAP_FILE_PATH, "r"); + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); + + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) + goto err_out; + + /* + * Fetch the pattern in the same block and check the number of + * hugepages. + */ + if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) + goto err_out; + + snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); + + if (sscanf(buffer, addr_pattern, &thp) != 1) + ksft_exit_fail_msg("Reading smap error\n"); + +err_out: + fclose(fp); + return thp == (nr_hpages * (hpage_size >> 10)); +} + +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); +} + +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); +} + +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h new file mode 100644 index 000000000000..1995ee911ef2 --- /dev/null +++ b/tools/testing/selftests/mm/vm_util.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <stdint.h> +#include <stdio.h> + +uint64_t pagemap_get_entry(int fd, char *start); +bool pagemap_is_softdirty(int fd, char *start); +bool pagemap_is_swapped(int fd, char *start); +bool pagemap_is_populated(int fd, char *start); +unsigned long pagemap_get_pfn(int fd, char *start); +void clear_softdirty(void); +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); +uint64_t read_pmd_pagesize(void); +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); diff --git a/tools/testing/selftests/mm/write_hugetlb_memory.sh b/tools/testing/selftests/mm/write_hugetlb_memory.sh new file mode 100644 index 000000000000..70a02301f4c2 --- /dev/null +++ b/tools/testing/selftests/mm/write_hugetlb_memory.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +set -e + +size=$1 +populate=$2 +write=$3 +cgroup=$4 +path=$5 +method=$6 +private=$7 +want_sleep=$8 +reserve=$9 + +echo "Putting task in cgroup '$cgroup'" +echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs + +echo "Method is $method" + +set +e +./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \ + "$private" "$want_sleep" "$reserve" diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c new file mode 100644 index 000000000000..6a2caba19ee1 --- /dev/null +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This program reserves and uses hugetlb memory, supporting a bunch of + * scenarios needed by the charge_reserved_hugetlb.sh test. + */ + +#include <err.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/stat.h> +#include <sys/mman.h> + +/* Global definitions. */ +enum method { + HUGETLBFS, + MMAP_MAP_HUGETLB, + SHM, + MAX_METHOD +}; + + +/* Global variables. */ +static const char *self; +static char *shmaddr; +static int shmid; + +/* + * Show usage and exit.
+ */ +static void exit_usage(void) +{ + printf("Usage: %s -p <path to hugetlbfs file> -s <size to map> " + "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] " + "[-o] [-w] [-n]\n", + self); + exit(EXIT_FAILURE); +} + +void sig_handler(int signo) +{ + printf("Received %d.\n", signo); + if (signo == SIGINT) { + printf("Deleting the memory\n"); + if (shmdt((const void *)shmaddr) != 0) { + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(4); + } + + shmctl(shmid, IPC_RMID, NULL); + printf("Done deleting the memory\n"); + } + exit(2); +} + +int main(int argc, char **argv) +{ + int fd = 0; + int key = 0; + int *ptr = NULL; + int c = 0; + int size = 0; + char path[256] = ""; + enum method method = MAX_METHOD; + int want_sleep = 0, private = 0; + int populate = 0; + int write = 0; + int reserve = 1; + + if (signal(SIGINT, sig_handler) == SIG_ERR) + err(1, "\ncan't catch SIGINT\n"); + + /* Parse command-line arguments. */ + setvbuf(stdout, NULL, _IONBF, 0); + self = argv[0]; + + while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) { + switch (c) { + case 's': + size = atoi(optarg); + break; + case 'p': + strncpy(path, optarg, sizeof(path)); + break; + case 'm': + if (atoi(optarg) >= MAX_METHOD) { + errno = EINVAL; + perror("Invalid -m."); + exit_usage(); + } + method = atoi(optarg); + break; + case 'o': + populate = 1; + break; + case 'w': + write = 1; + break; + case 'l': + want_sleep = 1; + break; + case 'r': + private = 1; + break; + case 'n': + reserve = 0; + break; + default: + errno = EINVAL; + perror("Invalid arg"); + exit_usage(); + } + } + + if (strncmp(path, "", sizeof(path)) != 0) { + printf("Writing to this path: %s\n", path); + } else { + errno = EINVAL; + perror("path not found"); + exit_usage(); + } + + if (size != 0) { + printf("Writing this size: %d\n", size); + } else { + errno = EINVAL; + perror("size not found"); + exit_usage(); + } + + if (!populate) + printf("Not populating.\n"); + else + printf("Populating.\n"); + + if (!write) + printf("Not writing to memory.\n"); + + if (method == MAX_METHOD) { + errno = EINVAL; + perror("-m Invalid"); + exit_usage(); + } else + printf("Using method=%d\n", method); + + if (!private) + printf("Shared mapping.\n"); + else + printf("Private mapping.\n"); + + if (!reserve) + printf("NO_RESERVE mapping.\n"); + else + printf("RESERVE mapping.\n"); + + switch (method) { + case HUGETLBFS: + printf("Allocating using HUGETLBFS.\n"); + fd = open(path, O_CREAT | O_RDWR, 0777); + if (fd == -1) + err(1, "Failed to open file."); + + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + (private ? MAP_PRIVATE : MAP_SHARED) | + (populate ? MAP_POPULATE : 0) | + (reserve ? 0 : MAP_NORESERVE), + fd, 0); + + if (ptr == MAP_FAILED) { + close(fd); + err(1, "Error mapping the file"); + } + break; + case MMAP_MAP_HUGETLB: + printf("Allocating using MAP_HUGETLB.\n"); + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + (private ? (MAP_PRIVATE | MAP_ANONYMOUS) : + MAP_SHARED) | + MAP_HUGETLB | (populate ? MAP_POPULATE : 0) | + (reserve ?
0 : MAP_NORESERVE), + -1, 0); + + if (ptr == MAP_FAILED) + err(1, "mmap"); + + printf("Returned address is %p\n", ptr); + break; + case SHM: + printf("Allocating using SHM.\n"); + shmid = shmget(key, size, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) { + shmid = shmget(++key, size, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) + err(1, "shmget"); + } + printf("shmid: 0x%x, shmget key:%d\n", shmid, key); + + ptr = shmat(shmid, NULL, 0); + if (ptr == (int *)-1) { + perror("Shared memory attach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(2); + } + printf("shmaddr: %p\n", ptr); + + break; + default: + errno = EINVAL; + err(1, "Invalid method."); + } + + if (write) { + printf("Writing to memory.\n"); + memset(ptr, 1, size); + } + + if (want_sleep) { + /* Signal to caller that we're done. */ + printf("DONE\n"); + + /* Hold memory until external kill signal is delivered. */ + while (1) + sleep(100); + } + + if (method == HUGETLBFS) + close(fd); + + return 0; +} diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore deleted file mode 100644 index 1f8c36a9fa10..000000000000 --- a/tools/testing/selftests/vm/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -cow -hugepage-mmap -hugepage-mremap -hugepage-shm -hugepage-vmemmap -hugetlb-madvise -khugepaged -map_hugetlb -map_populate -thuge-gen -compaction_test -migration -mlock2-tests -mrelease_test -mremap_dontunmap -mremap_test -on-fault-limit -transhuge-stress -protection_keys -protection_keys_32 -protection_keys_64 -madv_populate -userfaultfd -mlock-intersect-test -mlock-random-test -virtual_address_range -gup_test -va_128TBswitch -map_fixed_noreplace -write_to_hugetlbfs -hmm-tests -memfd_secret -soft-dirty -split_huge_page_test -ksm_tests -local_config.h -local_config.mk diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile deleted file mode 100644 index 89c14e41bd43..000000000000 --- a/tools/testing/selftests/vm/Makefile +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for vm selftests - -LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h - -include local_config.mk - -uname_M := $(shell uname -m 2>/dev/null || echo not) -MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') - -# Without this, failed build products remain, with up-to-date timestamps, -# thus tricking Make (and you!) into believing that All Is Well, in subsequent -# make invocations: -.DELETE_ON_ERROR: - -# Avoid accidental wrong builds, due to built-in rules working just a little -# bit too well--but not quite as well as required for our situation here. -# -# In other words, "make userfaultfd" is supposed to fail to build at all, -# because this Makefile only supports either "make" (all), or "make /full/path". -# However, the built-in rules, if not suppressed, will pick up CFLAGS and the -# initial LDLIBS (but not the target-specific LDLIBS, because those are only -# set for the full path target!). This causes it to get pretty far into building -# things despite using incorrect values such as an *occasionally* incomplete -# LDLIBS. 
-MAKEFLAGS += --no-builtin-rules - -CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES) -LDLIBS = -lrt -lpthread -TEST_GEN_FILES = cow -TEST_GEN_FILES += compaction_test -TEST_GEN_FILES += gup_test -TEST_GEN_FILES += hmm-tests -TEST_GEN_FILES += hugetlb-madvise -TEST_GEN_FILES += hugepage-mmap -TEST_GEN_FILES += hugepage-mremap -TEST_GEN_FILES += hugepage-shm -TEST_GEN_FILES += hugepage-vmemmap -TEST_GEN_FILES += khugepaged -TEST_GEN_PROGS = madv_populate -TEST_GEN_FILES += map_fixed_noreplace -TEST_GEN_FILES += map_hugetlb -TEST_GEN_FILES += map_populate -TEST_GEN_FILES += memfd_secret -TEST_GEN_FILES += migration -TEST_GEN_FILES += mlock-random-test -TEST_GEN_FILES += mlock2-tests -TEST_GEN_FILES += mrelease_test -TEST_GEN_FILES += mremap_dontunmap -TEST_GEN_FILES += mremap_test -TEST_GEN_FILES += on-fault-limit -TEST_GEN_FILES += thuge-gen -TEST_GEN_FILES += transhuge-stress -TEST_GEN_FILES += userfaultfd -TEST_GEN_PROGS += soft-dirty -TEST_GEN_PROGS += split_huge_page_test -TEST_GEN_FILES += ksm_tests -TEST_GEN_PROGS += ksm_functional_tests - -ifeq ($(MACHINE),x86_64) -CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) -CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) -CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) - -VMTARGETS := protection_keys -BINARIES_32 := $(VMTARGETS:%=%_32) -BINARIES_64 := $(VMTARGETS:%=%_64) - -ifeq ($(CAN_BUILD_WITH_NOPIE),1) -CFLAGS += -no-pie -endif - -ifeq ($(CAN_BUILD_I386),1) -TEST_GEN_FILES += $(BINARIES_32) -endif - -ifeq ($(CAN_BUILD_X86_64),1) -TEST_GEN_FILES += $(BINARIES_64) -endif -else - -ifneq (,$(findstring $(MACHINE),ppc64)) -TEST_GEN_FILES += protection_keys -endif - -endif - -ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64)) -TEST_GEN_FILES += va_128TBswitch -TEST_GEN_FILES += virtual_address_range -TEST_GEN_FILES += write_to_hugetlbfs -endif - -TEST_PROGS := run_vmtests.sh - -TEST_FILES := test_vmalloc.sh -TEST_FILES += test_hmm.sh -TEST_FILES += va_128TBswitch.sh - -include ../lib.mk - -$(OUTPUT)/cow: vm_util.c -$(OUTPUT)/khugepaged: vm_util.c -$(OUTPUT)/ksm_functional_tests: vm_util.c -$(OUTPUT)/madv_populate: vm_util.c -$(OUTPUT)/soft-dirty: vm_util.c -$(OUTPUT)/split_huge_page_test: vm_util.c -$(OUTPUT)/userfaultfd: vm_util.c - -ifeq ($(MACHINE),x86_64) -BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) -BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) - -define gen-target-rule-32 -$(1) $(1)_32: $(OUTPUT)/$(1)_32 -.PHONY: $(1) $(1)_32 -endef - -define gen-target-rule-64 -$(1) $(1)_64: $(OUTPUT)/$(1)_64 -.PHONY: $(1) $(1)_64 -endef - -ifeq ($(CAN_BUILD_I386),1) -$(BINARIES_32): CFLAGS += -m32 -mxsave -$(BINARIES_32): LDLIBS += -lrt -ldl -lm -$(BINARIES_32): $(OUTPUT)/%_32: %.c - $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ -$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t)))) -endif - -ifeq ($(CAN_BUILD_X86_64),1) -$(BINARIES_64): CFLAGS += -m64 -mxsave -$(BINARIES_64): LDLIBS += -lrt -ldl -$(BINARIES_64): $(OUTPUT)/%_64: %.c - $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ -$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t)))) -endif - -# x86_64 users should be encouraged to install 32-bit libraries -ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) -all: warn_32bit_failure - -warn_32bit_failure: - @echo "Warning: you seem to have a broken 32-bit build" 2>&1; 
\ - echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ - echo "kernels. If you are using a Debian-like distribution," 2>&1; \ - echo "try:"; 2>&1; \ - echo ""; \ - echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ - echo ""; \ - echo "If you are using a Fedora-like distribution, try:"; \ - echo ""; \ - echo " yum install glibc-devel.*i686"; \ - exit 0; -endif -endif - -# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. -$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS) - -$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap - -$(OUTPUT)/ksm_tests: LDLIBS += -lnuma - -$(OUTPUT)/migration: LDLIBS += -lnuma - -local_config.mk local_config.h: check_config.sh - /bin/sh ./check_config.sh $(CC) - -EXTRA_CLEAN += local_config.mk local_config.h - -ifeq ($(COW_EXTRA_LIBS),) -all: warn_missing_liburing - -warn_missing_liburing: - @echo ; \ - echo "Warning: missing liburing support. Some COW tests will be skipped." ; \ - echo -endif diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh deleted file mode 100644 index a5cb4b09a46c..000000000000 --- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh +++ /dev/null @@ -1,584 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -set -e - -if [[ $(id -u) -ne 0 ]]; then - echo "This test must be run as root. Skipping..." - exit $ksft_skip -fi - -fault_limit_file=limit_in_bytes -reservation_limit_file=rsvd.limit_in_bytes -fault_usage_file=usage_in_bytes -reservation_usage_file=rsvd.usage_in_bytes - -if [[ "$1" == "-cgroup-v2" ]]; then - cgroup2=1 - fault_limit_file=max - reservation_limit_file=rsvd.max - fault_usage_file=current - reservation_usage_file=rsvd.current -fi - -if [[ $cgroup2 ]]; then - cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') - if [[ -z "$cgroup_path" ]]; then - cgroup_path=/dev/cgroup/memory - mount -t cgroup2 none $cgroup_path - do_umount=1 - fi - echo "+hugetlb" >$cgroup_path/cgroup.subtree_control -else - cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') - if [[ -z "$cgroup_path" ]]; then - cgroup_path=/dev/cgroup/memory - mount -t cgroup memory,hugetlb $cgroup_path - do_umount=1 - fi -fi -export cgroup_path - -function cleanup() { - if [[ $cgroup2 ]]; then - echo $$ >$cgroup_path/cgroup.procs - else - echo $$ >$cgroup_path/tasks - fi - - if [[ -e /mnt/huge ]]; then - rm -rf /mnt/huge/* - umount /mnt/huge || echo error - rmdir /mnt/huge - fi - if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then - rmdir $cgroup_path/hugetlb_cgroup_test - fi - if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then - rmdir $cgroup_path/hugetlb_cgroup_test1 - fi - if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then - rmdir $cgroup_path/hugetlb_cgroup_test2 - fi - echo 0 >/proc/sys/vm/nr_hugepages - echo CLEANUP DONE -} - -function expect_equal() { - local expected="$1" - local actual="$2" - local error="$3" - - if [[ "$expected" != "$actual" ]]; then - echo "expected ($expected) != actual ($actual): $3" - cleanup - exit 1 - fi -} - -function get_machine_hugepage_size() { - hpz=$(grep -i hugepagesize /proc/meminfo) - kb=${hpz:14:-3} - mb=$(($kb / 1024)) - echo $mb -} - -MB=$(get_machine_hugepage_size) - -function setup_cgroup() { - local name="$1" - local cgroup_limit="$2" - local reservation_limit="$3" - - mkdir $cgroup_path/$name - - echo writing cgroup limit: "$cgroup_limit" - echo "$cgroup_limit" 
>$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file - - echo writing reseravation limit: "$reservation_limit" - echo "$reservation_limit" > \ - $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file - - if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then - echo 0 >$cgroup_path/$name/cpuset.cpus - fi - if [ -e "$cgroup_path/$name/cpuset.mems" ]; then - echo 0 >$cgroup_path/$name/cpuset.mems - fi -} - -function wait_for_hugetlb_memory_to_get_depleted() { - local cgroup="$1" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get depleted. - while [ $(cat $path) != 0 ]; do - echo Waiting for hugetlb memory to get depleted. - cat $path - sleep 0.5 - done -} - -function wait_for_hugetlb_memory_to_get_reserved() { - local cgroup="$1" - local size="$2" - - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory reservation to reach size $size. - cat $path - sleep 0.5 - done -} - -function wait_for_hugetlb_memory_to_get_written() { - local cgroup="$1" - local size="$2" - - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory to reach size $size. - cat $path - sleep 0.5 - done -} - -function write_hugetlbfs_and_get_usage() { - local cgroup="$1" - local size="$2" - local populate="$3" - local write="$4" - local path="$5" - local method="$6" - local private="$7" - local expect_failure="$8" - local reserve="$9" - - # Function return values. - reservation_failed=0 - oom_killed=0 - hugetlb_difference=0 - reserved_difference=0 - - local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file - local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file - - local hugetlb_before=$(cat $hugetlb_usage) - local reserved_before=$(cat $reserved_usage) - - echo - echo Starting: - echo hugetlb_usage="$hugetlb_before" - echo reserved_usage="$reserved_before" - echo expect_failure is "$expect_failure" - - output=$(mktemp) - set +e - if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] || - [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then - - bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ - "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output & - - local write_result=$? - local write_pid=$! - - until grep -q -i "DONE" $output; do - echo waiting for DONE signal. - if ! ps $write_pid > /dev/null - then - echo "FAIL: The write died" - cleanup - exit 1 - fi - sleep 0.5 - done - - echo ================= write_hugetlb_memory.sh output is: - cat $output - echo ================= end output. - - if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then - wait_for_hugetlb_memory_to_get_written "$cgroup" "$size" - elif [[ "$reserve" != "-n" ]]; then - wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" - else - # This case doesn't produce visible effects, but we still have - # to wait for the async process to start and execute... - sleep 0.5 - fi - - echo write_result is $write_result - else - bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ - "$cgroup" "$path" "$method" "$private" "$reserve" - local write_result=$? 
- - if [[ "$reserve" != "-n" ]]; then - wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" - fi - fi - set -e - - if [[ "$write_result" == 1 ]]; then - reservation_failed=1 - fi - - # On linus/master, the above process gets SIGBUS'd on oomkill, with - # return code 135. On earlier kernels, it gets actual oomkill, with return - # code 137, so just check for both conditions in case we're testing - # against an earlier kernel. - if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then - oom_killed=1 - fi - - local hugetlb_after=$(cat $hugetlb_usage) - local reserved_after=$(cat $reserved_usage) - - echo After write: - echo hugetlb_usage="$hugetlb_after" - echo reserved_usage="$reserved_after" - - hugetlb_difference=$(($hugetlb_after - $hugetlb_before)) - reserved_difference=$(($reserved_after - $reserved_before)) -} - -function cleanup_hugetlb_memory() { - set +e - local cgroup="$1" - if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then - echo killing write_to_hugetlbfs - killall -2 write_to_hugetlbfs - wait_for_hugetlb_memory_to_get_depleted $cgroup - fi - set -e - - if [[ -e /mnt/huge ]]; then - rm -rf /mnt/huge/* - umount /mnt/huge - rmdir /mnt/huge - fi -} - -function run_test() { - local size=$(($1 * ${MB} * 1024 * 1024)) - local populate="$2" - local write="$3" - local cgroup_limit=$(($4 * ${MB} * 1024 * 1024)) - local reservation_limit=$(($5 * ${MB} * 1024 * 1024)) - local nr_hugepages="$6" - local method="$7" - local private="$8" - local expect_failure="$9" - local reserve="${10}" - - # Function return values. - hugetlb_difference=0 - reserved_difference=0 - reservation_failed=0 - oom_killed=0 - - echo nr hugepages = "$nr_hugepages" - echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages - - setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit" - - mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge - - write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \ - "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \ - "$reserve" - - cleanup_hugetlb_memory "hugetlb_cgroup_test" - - local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file) - local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file) - - echo $hugetlb_difference - echo $reserved_difference - expect_equal "0" "$final_hugetlb" "final hugetlb is not zero" - expect_equal "0" "$final_reservation" "final reservation is not zero" -} - -function run_multiple_cgroup_test() { - local size1="$1" - local populate1="$2" - local write1="$3" - local cgroup_limit1="$4" - local reservation_limit1="$5" - - local size2="$6" - local populate2="$7" - local write2="$8" - local cgroup_limit2="$9" - local reservation_limit2="${10}" - - local nr_hugepages="${11}" - local method="${12}" - local private="${13}" - local expect_failure="${14}" - local reserve="${15}" - - # Function return values. 
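[Editor's note on the write_result handling above: a shell reports a child terminated by signal N as exit status 128 + N, which is where 135 (SIGBUS) and 137 (SIGKILL, the OOM killer) come from. A self-contained C illustration of the same classification; this program is hypothetical and not part of the selftest:]

    #include <signal.h>
    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            int status;
            pid_t pid = fork();

            if (pid == 0)
                    _exit(1);       /* stand-in for a failed reservation */
            waitpid(pid, &status, 0);

            /* The shell sees death-by-signal N as 128 + N: hence 135
             * for SIGBUS and 137 for SIGKILL (the OOM killer). */
            if (WIFEXITED(status) && WEXITSTATUS(status) == 1)
                    printf("reservation failed\n");
            else if (WIFSIGNALED(status) && (WTERMSIG(status) == SIGBUS ||
                                             WTERMSIG(status) == SIGKILL))
                    printf("writer killed (SIGBUS/OOM path)\n");
            return 0;
    }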
- hugetlb_difference1=0 - reserved_difference1=0 - reservation_failed1=0 - oom_killed1=0 - - hugetlb_difference2=0 - reserved_difference2=0 - reservation_failed2=0 - oom_killed2=0 - - echo nr hugepages = "$nr_hugepages" - echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages - - setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1" - setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2" - - mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge - - write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \ - "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \ - "$expect_failure" "$reserve" - - hugetlb_difference1=$hugetlb_difference - reserved_difference1=$reserved_difference - reservation_failed1=$reservation_failed - oom_killed1=$oom_killed - - local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file - local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file - local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file - local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file - - local usage_before_second_write=$(cat $cgroup1_hugetlb_usage) - local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage) - - write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \ - "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \ - "$expect_failure" "$reserve" - - hugetlb_difference2=$hugetlb_difference - reserved_difference2=$reserved_difference - reservation_failed2=$reservation_failed - oom_killed2=$oom_killed - - expect_equal "$usage_before_second_write" \ - "$(cat $cgroup1_hugetlb_usage)" "Usage changed." - expect_equal "$reservation_usage_before_second_write" \ - "$(cat $cgroup1_reservation_usage)" "Reservation usage changed." - - cleanup_hugetlb_memory - - local final_hugetlb=$(cat $cgroup1_hugetlb_usage) - local final_reservation=$(cat $cgroup1_reservation_usage) - - expect_equal "0" "$final_hugetlb" \ - "hugetlbt_cgroup_test1 final hugetlb is not zero" - expect_equal "0" "$final_reservation" \ - "hugetlbt_cgroup_test1 final reservation is not zero" - - local final_hugetlb=$(cat $cgroup2_hugetlb_usage) - local final_reservation=$(cat $cgroup2_reservation_usage) - - expect_equal "0" "$final_hugetlb" \ - "hugetlb_cgroup_test2 final hugetlb is not zero" - expect_equal "0" "$final_reservation" \ - "hugetlb_cgroup_test2 final reservation is not zero" -} - -cleanup - -for populate in "" "-o"; do - for method in 0 1 2; do - for private in "" "-r"; do - for reserve in "" "-n"; do - - # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported. - if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then - continue - fi - - # Skip populated shmem tests. Doesn't seem to be supported. - if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then - continue - fi - - if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then - continue - fi - - cleanup - echo - echo - echo - echo Test normal case. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve" - - echo Memory charged to hugtlb=$hugetlb_difference - echo Memory charged to reservation=$reserved_difference - - if [[ "$populate" == "-o" ]]; then - expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ - "Reserved memory charged to hugetlb cgroup." 
- else - expect_equal "0" "$hugetlb_difference" \ - "Reserved memory charged to hugetlb cgroup." - fi - - if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then - expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ - "Reserved memory not charged to reservation usage." - else - expect_equal "0" "$reserved_difference" \ - "Reserved memory not charged to reservation usage." - fi - - echo 'PASS' - - cleanup - echo - echo - echo - echo Test normal case with write. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve" - - echo Memory charged to hugtlb=$hugetlb_difference - echo Memory charged to reservation=$reserved_difference - - expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ - "Reserved memory charged to hugetlb cgroup." - - expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ - "Reserved memory not charged to reservation usage." - - echo 'PASS' - - cleanup - continue - echo - echo - echo - echo Test more than reservation case. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - - if [ "$reserve" != "-n" ]; then - run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \ - "$reserve" - - expect_equal "1" "$reservation_failed" "Reservation succeeded." - fi - - echo 'PASS' - - cleanup - - echo - echo - echo - echo Test more than cgroup limit case. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - - # Not sure if shm memory can be cleaned up when the process gets sigbus'd. - if [[ "$method" != 2 ]]; then - run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve" - - expect_equal "1" "$oom_killed" "Not oom killed." - fi - echo 'PASS' - - cleanup - - echo - echo - echo - echo Test normal case, multiple cgroups. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \ - "$populate" "" "10" "10" "10" \ - "$method" "$private" "0" "$reserve" - - echo Memory charged to hugtlb1=$hugetlb_difference1 - echo Memory charged to reservation1=$reserved_difference1 - echo Memory charged to hugtlb2=$hugetlb_difference2 - echo Memory charged to reservation2=$reserved_difference2 - - if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then - expect_equal "3" "$reserved_difference1" \ - "Incorrect reservations charged to cgroup 1." - - expect_equal "5" "$reserved_difference2" \ - "Incorrect reservation charged to cgroup 2." - - else - expect_equal "0" "$reserved_difference1" \ - "Incorrect reservations charged to cgroup 1." - - expect_equal "0" "$reserved_difference2" \ - "Incorrect reservation charged to cgroup 2." - fi - - if [[ "$populate" == "-o" ]]; then - expect_equal "3" "$hugetlb_difference1" \ - "Incorrect hugetlb charged to cgroup 1." - - expect_equal "5" "$hugetlb_difference2" \ - "Incorrect hugetlb charged to cgroup 2." - - else - expect_equal "0" "$hugetlb_difference1" \ - "Incorrect hugetlb charged to cgroup 1." - - expect_equal "0" "$hugetlb_difference2" \ - "Incorrect hugetlb charged to cgroup 2." - fi - echo 'PASS' - - cleanup - echo - echo - echo - echo Test normal case with write, multiple cgroups. 
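[Editor's note: the reservation-accounting cases above hinge on when hugetlb pages are charged. A default shared mapping charges the rsvd. counters at mmap() time, whereas the no-reserve variant (the "-n" flag passed down to the writer) in effect maps with MAP_NORESERVE and is charged only as pages are faulted. A minimal C sketch of that distinction; the fd, length, and helper name are illustrative assumptions:]

    #include <stddef.h>
    #include <sys/mman.h>

    /* Map "len" bytes of a hugetlbfs file; with "noreserve" set, no
     * hugetlb reservation is charged until the pages are touched. */
    static void *map_huge(int fd, size_t len, int noreserve)
    {
            int flags = MAP_SHARED | (noreserve ? MAP_NORESERVE : 0);

            return mmap(NULL, len, PROT_READ | PROT_WRITE, flags, fd, 0);
    }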
- echo private=$private, populate=$populate, method=$method, reserve=$reserve
- run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \
- "$populate" "-w" "10" "10" "10" \
- "$method" "$private" "0" "$reserve"
-
- echo Memory charged to hugetlb1=$hugetlb_difference1
- echo Memory charged to reservation1=$reserved_difference1
- echo Memory charged to hugetlb2=$hugetlb_difference2
- echo Memory charged to reservation2=$reserved_difference2
-
- expect_equal "3" "$hugetlb_difference1" \
- "Incorrect hugetlb charged to cgroup 1."
-
- expect_equal "3" "$reserved_difference1" \
- "Incorrect reservation charged to cgroup 1."
-
- expect_equal "5" "$hugetlb_difference2" \
- "Incorrect hugetlb charged to cgroup 2."
-
- expect_equal "5" "$reserved_difference2" \
- "Incorrect reservation charged to cgroup 2."
- echo 'PASS'
-
- cleanup
-
- done # reserve
- done # private
- done # populate
-done # method
-
-if [[ $do_umount ]]; then
- umount $cgroup_path
- rmdir $cgroup_path
-fi
diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh
deleted file mode 100644
index bcba3af0acea..000000000000
--- a/tools/testing/selftests/vm/check_config.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-#
-# Probe for libraries and create header files to record the results. Both C
-# header files and Makefile include fragments are created.
-
-OUTPUT_H_FILE=local_config.h
-OUTPUT_MKFILE=local_config.mk
-
-tmpname=$(mktemp)
-tmpfile_c=${tmpname}.c
-tmpfile_o=${tmpname}.o
-
-# liburing
-echo "#include <sys/types.h>" > $tmpfile_c
-echo "#include <liburing.h>" >> $tmpfile_c
-echo "int func(void) { return 0; }" >> $tmpfile_c
-
-CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
-$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
-
-if [ -f $tmpfile_o ]; then
- echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE
- echo "COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE
-else
- echo "// No liburing support found" > $OUTPUT_H_FILE
- echo "# No liburing support found, so:" > $OUTPUT_MKFILE
- echo "COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE
-fi
-
-rm ${tmpname}.*
diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c
deleted file mode 100644
index 9b420140ba2b..000000000000
--- a/tools/testing/selftests/vm/compaction_test.c
+++ /dev/null
@@ -1,231 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *
- * A test for the patch "Allow compaction of unevictable pages".
- * With this patch we should be able to allocate at least 1/4
- * of RAM in huge pages. Without the patch much less is
- * allocated.
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" - -#define MAP_SIZE_MB 100 -#define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024) - -struct map_list { - void *map; - struct map_list *next; -}; - -int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize) -{ - char buffer[256] = {0}; - char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'"; - FILE *cmdfile = popen(cmd, "r"); - - if (!(fgets(buffer, sizeof(buffer), cmdfile))) { - perror("Failed to read meminfo\n"); - return -1; - } - - pclose(cmdfile); - - *memfree = atoll(buffer); - cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'"; - cmdfile = popen(cmd, "r"); - - if (!(fgets(buffer, sizeof(buffer), cmdfile))) { - perror("Failed to read meminfo\n"); - return -1; - } - - pclose(cmdfile); - *hugepagesize = atoll(buffer); - - return 0; -} - -int prereq(void) -{ - char allowed; - int fd; - - fd = open("/proc/sys/vm/compact_unevictable_allowed", - O_RDONLY | O_NONBLOCK); - if (fd < 0) { - perror("Failed to open\n" - "/proc/sys/vm/compact_unevictable_allowed\n"); - return -1; - } - - if (read(fd, &allowed, sizeof(char)) != sizeof(char)) { - perror("Failed to read from\n" - "/proc/sys/vm/compact_unevictable_allowed\n"); - close(fd); - return -1; - } - - close(fd); - if (allowed == '1') - return 0; - - return -1; -} - -int check_compaction(unsigned long mem_free, unsigned int hugepage_size) -{ - int fd; - int compaction_index = 0; - char initial_nr_hugepages[10] = {0}; - char nr_hugepages[10] = {0}; - - /* We want to test with 80% of available memory. Else, OOM killer comes - in to play */ - mem_free = mem_free * 0.8; - - fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); - if (fd < 0) { - perror("Failed to open /proc/sys/vm/nr_hugepages"); - return -1; - } - - if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) { - perror("Failed to read from /proc/sys/vm/nr_hugepages"); - goto close_fd; - } - - /* Start with the initial condition of 0 huge pages*/ - if (write(fd, "0", sizeof(char)) != sizeof(char)) { - perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - lseek(fd, 0, SEEK_SET); - - /* Request a large number of huge pages. The Kernel will allocate - as much as it can */ - if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { - perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - lseek(fd, 0, SEEK_SET); - - if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { - perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - /* We should have been able to request at least 1/3 rd of the memory in - huge pages */ - compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size); - - if (compaction_index > 3) { - printf("No of huge pages allocated = %d\n", - (atoi(nr_hugepages))); - fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n" - "as huge pages\n", compaction_index); - goto close_fd; - } - - printf("No of huge pages allocated = %d\n", - (atoi(nr_hugepages))); - - lseek(fd, 0, SEEK_SET); - - if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages)) - != strlen(initial_nr_hugepages)) { - perror("Failed to write value to /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - close(fd); - return 0; - - close_fd: - close(fd); - printf("Not OK. 
Compaction test failed."); - return -1; -} - - -int main(int argc, char **argv) -{ - struct rlimit lim; - struct map_list *list, *entry; - size_t page_size, i; - void *map = NULL; - unsigned long mem_free = 0; - unsigned long hugepage_size = 0; - long mem_fragmentable_MB = 0; - - if (prereq() != 0) { - printf("Either the sysctl compact_unevictable_allowed is not\n" - "set to 1 or couldn't read the proc file.\n" - "Skipping the test\n"); - return KSFT_SKIP; - } - - lim.rlim_cur = RLIM_INFINITY; - lim.rlim_max = RLIM_INFINITY; - if (setrlimit(RLIMIT_MEMLOCK, &lim)) { - perror("Failed to set rlimit:\n"); - return -1; - } - - page_size = getpagesize(); - - list = NULL; - - if (read_memory_info(&mem_free, &hugepage_size) != 0) { - printf("ERROR: Cannot read meminfo\n"); - return -1; - } - - mem_fragmentable_MB = mem_free * 0.8 / 1024; - - while (mem_fragmentable_MB > 0) { - map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0); - if (map == MAP_FAILED) - break; - - entry = malloc(sizeof(struct map_list)); - if (!entry) { - munmap(map, MAP_SIZE); - break; - } - entry->map = map; - entry->next = list; - list = entry; - - /* Write something (in this case the address of the map) to - * ensure that KSM can't merge the mapped pages - */ - for (i = 0; i < MAP_SIZE; i += page_size) - *(unsigned long *)(map + i) = (unsigned long)map + i; - - mem_fragmentable_MB -= MAP_SIZE_MB; - } - - for (entry = list; entry != NULL; entry = entry->next) { - munmap(entry->map, MAP_SIZE); - if (!entry->next) - break; - entry = entry->next; - } - - if (check_compaction(mem_free, hugepage_size) == 0) - return 0; - - return -1; -} diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config deleted file mode 100644 index be087c4bc396..000000000000 --- a/tools/testing/selftests/vm/config +++ /dev/null @@ -1,8 +0,0 @@ -CONFIG_SYSVIPC=y -CONFIG_USERFAULTFD=y -CONFIG_TEST_VMALLOC=m -CONFIG_DEVICE_PRIVATE=y -CONFIG_TEST_HMM=m -CONFIG_GUP_TEST=y -CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_MEM_SOFT_DIRTY=y diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/vm/cow.c deleted file mode 100644 index 16216d893d96..000000000000 --- a/tools/testing/selftests/vm/cow.c +++ /dev/null @@ -1,1764 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * COW (Copy On Write) tests. - * - * Copyright 2022, Red Hat, Inc. 
- * - * Author(s): David Hildenbrand - */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "local_config.h" -#ifdef LOCAL_CONFIG_HAVE_LIBURING -#include -#endif /* LOCAL_CONFIG_HAVE_LIBURING */ - -#include "../../../../mm/gup_test.h" -#include "../kselftest.h" -#include "vm_util.h" - -#ifndef MADV_COLLAPSE -#define MADV_COLLAPSE 25 -#endif - -static size_t pagesize; -static int pagemap_fd; -static size_t thpsize; -static int nr_hugetlbsizes; -static size_t hugetlbsizes[10]; -static int gup_fd; -static bool has_huge_zeropage; - -static void detect_thpsize(void) -{ - int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", - O_RDONLY); - size_t size = 0; - char buf[15]; - int ret; - - if (fd < 0) - return; - - ret = pread(fd, buf, sizeof(buf), 0); - if (ret > 0 && ret < sizeof(buf)) { - buf[ret] = 0; - - size = strtoul(buf, NULL, 10); - if (size < pagesize) - size = 0; - if (size > 0) { - thpsize = size; - ksft_print_msg("[INFO] detected THP size: %zu KiB\n", - thpsize / 1024); - } - } - - close(fd); -} - -static void detect_huge_zeropage(void) -{ - int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", - O_RDONLY); - size_t enabled = 0; - char buf[15]; - int ret; - - if (fd < 0) - return; - - ret = pread(fd, buf, sizeof(buf), 0); - if (ret > 0 && ret < sizeof(buf)) { - buf[ret] = 0; - - enabled = strtoul(buf, NULL, 10); - if (enabled == 1) { - has_huge_zeropage = true; - ksft_print_msg("[INFO] huge zeropage is enabled\n"); - } - } - - close(fd); -} - -static void detect_hugetlbsizes(void) -{ - DIR *dir = opendir("/sys/kernel/mm/hugepages/"); - - if (!dir) - return; - - while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) { - struct dirent *entry = readdir(dir); - size_t kb; - - if (!entry) - break; - if (entry->d_type != DT_DIR) - continue; - if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1) - continue; - hugetlbsizes[nr_hugetlbsizes] = kb * 1024; - nr_hugetlbsizes++; - ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n", - kb); - } - closedir(dir); -} - -static bool range_is_swapped(void *addr, size_t size) -{ - for (; size; addr += pagesize, size -= pagesize) - if (!pagemap_is_swapped(pagemap_fd, addr)) - return false; - return true; -} - -struct comm_pipes { - int child_ready[2]; - int parent_ready[2]; -}; - -static int setup_comm_pipes(struct comm_pipes *comm_pipes) -{ - if (pipe(comm_pipes->child_ready) < 0) - return -errno; - if (pipe(comm_pipes->parent_ready) < 0) { - close(comm_pipes->child_ready[0]); - close(comm_pipes->child_ready[1]); - return -errno; - } - - return 0; -} - -static void close_comm_pipes(struct comm_pipes *comm_pipes) -{ - close(comm_pipes->child_ready[0]); - close(comm_pipes->child_ready[1]); - close(comm_pipes->parent_ready[0]); - close(comm_pipes->parent_ready[1]); -} - -static int child_memcmp_fn(char *mem, size_t size, - struct comm_pipes *comm_pipes) -{ - char *old = malloc(size); - char buf; - - /* Backup the original content. */ - memcpy(old, mem, size); - - /* Wait until the parent modified the page. */ - write(comm_pipes->child_ready[1], "0", 1); - while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) - ; - - /* See if we still read the old values. 
*/ - return memcmp(old, mem, size); -} - -static int child_vmsplice_memcmp_fn(char *mem, size_t size, - struct comm_pipes *comm_pipes) -{ - struct iovec iov = { - .iov_base = mem, - .iov_len = size, - }; - ssize_t cur, total, transferred; - char *old, *new; - int fds[2]; - char buf; - - old = malloc(size); - new = malloc(size); - - /* Backup the original content. */ - memcpy(old, mem, size); - - if (pipe(fds) < 0) - return -errno; - - /* Trigger a read-only pin. */ - transferred = vmsplice(fds[1], &iov, 1, 0); - if (transferred < 0) - return -errno; - if (transferred == 0) - return -EINVAL; - - /* Unmap it from our page tables. */ - if (munmap(mem, size) < 0) - return -errno; - - /* Wait until the parent modified it. */ - write(comm_pipes->child_ready[1], "0", 1); - while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) - ; - - /* See if we still read the old values via the pipe. */ - for (total = 0; total < transferred; total += cur) { - cur = read(fds[0], new + total, transferred - total); - if (cur < 0) - return -errno; - } - - return memcmp(old, new, transferred); -} - -typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); - -static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, - child_fn fn) -{ - struct comm_pipes comm_pipes; - char buf; - int ret; - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - return; - } - - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_comm_pipes; - } else if (!ret) { - exit(fn(mem, size, &comm_pipes)); - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - - if (do_mprotect) { - /* - * mprotect() optimizations might try avoiding - * write-faults by directly mapping pages writable. - */ - ret = mprotect(mem, size, PROT_READ); - ret |= mprotect(mem, size, PROT_READ|PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - goto close_comm_pipes; - } - } - - /* Modify the page. 
*/ - memset(mem, 0xff, size); - write(comm_pipes.parent_ready[1], "0", 1); - - wait(&ret); - if (WIFEXITED(ret)) - ret = WEXITSTATUS(ret); - else - ret = -EINVAL; - - ksft_test_result(!ret, "No leak from parent into child\n"); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -} - -static void test_cow_in_parent(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, false, child_memcmp_fn); -} - -static void test_cow_in_parent_mprotect(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, true, child_memcmp_fn); -} - -static void test_vmsplice_in_child(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); -} - -static void test_vmsplice_in_child_mprotect(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); -} - -static void do_test_vmsplice_in_parent(char *mem, size_t size, - bool before_fork) -{ - struct iovec iov = { - .iov_base = mem, - .iov_len = size, - }; - ssize_t cur, total, transferred; - struct comm_pipes comm_pipes; - char *old, *new; - int ret, fds[2]; - char buf; - - old = malloc(size); - new = malloc(size); - - memcpy(old, mem, size); - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - goto free; - } - - if (pipe(fds) < 0) { - ksft_test_result_fail("pipe() failed\n"); - goto close_comm_pipes; - } - - if (before_fork) { - transferred = vmsplice(fds[1], &iov, 1, 0); - if (transferred <= 0) { - ksft_test_result_fail("vmsplice() failed\n"); - goto close_pipe; - } - } - - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_pipe; - } else if (!ret) { - write(comm_pipes.child_ready[1], "0", 1); - while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) - ; - /* Modify page content in the child. */ - memset(mem, 0xff, size); - exit(0); - } - - if (!before_fork) { - transferred = vmsplice(fds[1], &iov, 1, 0); - if (transferred <= 0) { - ksft_test_result_fail("vmsplice() failed\n"); - wait(&ret); - goto close_pipe; - } - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - if (munmap(mem, size) < 0) { - ksft_test_result_fail("munmap() failed\n"); - goto close_pipe; - } - write(comm_pipes.parent_ready[1], "0", 1); - - /* Wait until the child is done writing. */ - wait(&ret); - if (!WIFEXITED(ret)) { - ksft_test_result_fail("wait() failed\n"); - goto close_pipe; - } - - /* See if we still read the old values. 
*/ - for (total = 0; total < transferred; total += cur) { - cur = read(fds[0], new + total, transferred - total); - if (cur < 0) { - ksft_test_result_fail("read() failed\n"); - goto close_pipe; - } - } - - ksft_test_result(!memcmp(old, new, transferred), - "No leak from child into parent\n"); -close_pipe: - close(fds[0]); - close(fds[1]); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -free: - free(old); - free(new); -} - -static void test_vmsplice_before_fork(char *mem, size_t size) -{ - do_test_vmsplice_in_parent(mem, size, true); -} - -static void test_vmsplice_after_fork(char *mem, size_t size) -{ - do_test_vmsplice_in_parent(mem, size, false); -} - -#ifdef LOCAL_CONFIG_HAVE_LIBURING -static void do_test_iouring(char *mem, size_t size, bool use_fork) -{ - struct comm_pipes comm_pipes; - struct io_uring_cqe *cqe; - struct io_uring_sqe *sqe; - struct io_uring ring; - ssize_t cur, total; - struct iovec iov; - char *buf, *tmp; - int ret, fd; - FILE *file; - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - return; - } - - file = tmpfile(); - if (!file) { - ksft_test_result_fail("tmpfile() failed\n"); - goto close_comm_pipes; - } - fd = fileno(file); - assert(fd); - - tmp = malloc(size); - if (!tmp) { - ksft_test_result_fail("malloc() failed\n"); - goto close_file; - } - - /* Skip on errors, as we might just lack kernel support. */ - ret = io_uring_queue_init(1, &ring, 0); - if (ret < 0) { - ksft_test_result_skip("io_uring_queue_init() failed\n"); - goto free_tmp; - } - - /* - * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN - * | FOLL_LONGTERM the range. - * - * Skip on errors, as we might just lack kernel support or might not - * have sufficient MEMLOCK permissions. - */ - iov.iov_base = mem; - iov.iov_len = size; - ret = io_uring_register_buffers(&ring, &iov, 1); - if (ret) { - ksft_test_result_skip("io_uring_register_buffers() failed\n"); - goto queue_exit; - } - - if (use_fork) { - /* - * fork() and keep the child alive until we're done. Note that - * we expect the pinned page to not get shared with the child. - */ - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto unregister_buffers; - } else if (!ret) { - write(comm_pipes.child_ready[1], "0", 1); - while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) - ; - exit(0); - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - } else { - /* - * Map the page R/O into the page table. Enable softdirty - * tracking to stop the page from getting mapped R/W immediately - * again by mprotect() optimizations. Note that we don't have an - * easy way to test if that worked (the pagemap does not export - * if the page is mapped R/O vs. R/W). - */ - ret = mprotect(mem, size, PROT_READ); - clear_softdirty(); - ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto unregister_buffers; - } - } - - /* - * Modify the page and write page content as observed by the fixed - * buffer pin to the file so we can verify it. 
- */ - memset(mem, 0xff, size); - sqe = io_uring_get_sqe(&ring); - if (!sqe) { - ksft_test_result_fail("io_uring_get_sqe() failed\n"); - goto quit_child; - } - io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0); - - ret = io_uring_submit(&ring); - if (ret < 0) { - ksft_test_result_fail("io_uring_submit() failed\n"); - goto quit_child; - } - - ret = io_uring_wait_cqe(&ring, &cqe); - if (ret < 0) { - ksft_test_result_fail("io_uring_wait_cqe() failed\n"); - goto quit_child; - } - - if (cqe->res != size) { - ksft_test_result_fail("write_fixed failed\n"); - goto quit_child; - } - io_uring_cqe_seen(&ring, cqe); - - /* Read back the file content to the temporary buffer. */ - total = 0; - while (total < size) { - cur = pread(fd, tmp + total, size - total, total); - if (cur < 0) { - ksft_test_result_fail("pread() failed\n"); - goto quit_child; - } - total += cur; - } - - /* Finally, check if we read what we expected. */ - ksft_test_result(!memcmp(mem, tmp, size), - "Longterm R/W pin is reliable\n"); - -quit_child: - if (use_fork) { - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - } -unregister_buffers: - io_uring_unregister_buffers(&ring); -queue_exit: - io_uring_queue_exit(&ring); -free_tmp: - free(tmp); -close_file: - fclose(file); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -} - -static void test_iouring_ro(char *mem, size_t size) -{ - do_test_iouring(mem, size, false); -} - -static void test_iouring_fork(char *mem, size_t size) -{ - do_test_iouring(mem, size, true); -} - -#endif /* LOCAL_CONFIG_HAVE_LIBURING */ - -enum ro_pin_test { - RO_PIN_TEST, - RO_PIN_TEST_SHARED, - RO_PIN_TEST_PREVIOUSLY_SHARED, - RO_PIN_TEST_RO_EXCLUSIVE, -}; - -static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, - bool fast) -{ - struct pin_longterm_test args; - struct comm_pipes comm_pipes; - char *tmp, buf; - __u64 tmp_val; - int ret; - - if (gup_fd < 0) { - ksft_test_result_skip("gup_test not available\n"); - return; - } - - tmp = malloc(size); - if (!tmp) { - ksft_test_result_fail("malloc() failed\n"); - return; - } - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - goto free_tmp; - } - - switch (test) { - case RO_PIN_TEST: - break; - case RO_PIN_TEST_SHARED: - case RO_PIN_TEST_PREVIOUSLY_SHARED: - /* - * Share the pages with our child. As the pages are not pinned, - * this should just work. - */ - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_comm_pipes; - } else if (!ret) { - write(comm_pipes.child_ready[1], "0", 1); - while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) - ; - exit(0); - } - - /* Wait until our child is ready. */ - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - - if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) { - /* - * Tell the child to quit now and wait until it quit. - * The pages should now be mapped R/O into our page - * tables, but they are no longer shared. - */ - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - if (!WIFEXITED(ret)) - ksft_print_msg("[INFO] wait() failed\n"); - } - break; - case RO_PIN_TEST_RO_EXCLUSIVE: - /* - * Map the page R/O into the page table. Enable softdirty - * tracking to stop the page from getting mapped R/W immediately - * again by mprotect() optimizations. Note that we don't have an - * easy way to test if that worked (the pagemap does not export - * if the page is mapped R/O vs. R/W). 
- */ - ret = mprotect(mem, size, PROT_READ); - clear_softdirty(); - ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto close_comm_pipes; - } - break; - default: - assert(false); - } - - /* Take a R/O pin. This should trigger unsharing. */ - args.addr = (__u64)(uintptr_t)mem; - args.size = size; - args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; - ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); - if (ret) { - if (errno == EINVAL) - ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n"); - else - ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n"); - goto wait; - } - - /* Modify the page. */ - memset(mem, 0xff, size); - - /* - * Read back the content via the pin to the temporary buffer and - * test if we observed the modification. - */ - tmp_val = (__u64)(uintptr_t)tmp; - ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val); - if (ret) - ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n"); - else - ksft_test_result(!memcmp(mem, tmp, size), - "Longterm R/O pin is reliable\n"); - - ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP); - if (ret) - ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n"); -wait: - switch (test) { - case RO_PIN_TEST_SHARED: - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - if (!WIFEXITED(ret)) - ksft_print_msg("[INFO] wait() failed\n"); - break; - default: - break; - } -close_comm_pipes: - close_comm_pipes(&comm_pipes); -free_tmp: - free(tmp); -} - -static void test_ro_pin_on_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); -} - -static void test_ro_fast_pin_on_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); -} - -static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); -} - -static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); -} - -static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); -} - -static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); -} - -typedef void (*test_fn)(char *mem, size_t size); - -static void do_run_with_base_page(test_fn fn, bool swapout) -{ - char *mem; - int ret; - - mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return; - } - - ret = madvise(mem, pagesize, MADV_NOHUGEPAGE); - /* Ignore if not around on a kernel. */ - if (ret && errno != EINVAL) { - ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); - goto munmap; - } - - /* Populate a base page. */ - memset(mem, 0, pagesize); - - if (swapout) { - madvise(mem, pagesize, MADV_PAGEOUT); - if (!pagemap_is_swapped(pagemap_fd, mem)) { - ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); - goto munmap; - } - } - - fn(mem, pagesize); -munmap: - munmap(mem, pagesize); -} - -static void run_with_base_page(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with base page\n", desc); - do_run_with_base_page(fn, false); -} - -static void run_with_base_page_swap(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... 
with swapped out base page\n", desc); - do_run_with_base_page(fn, true); -} - -enum thp_run { - THP_RUN_PMD, - THP_RUN_PMD_SWAPOUT, - THP_RUN_PTE, - THP_RUN_PTE_SWAPOUT, - THP_RUN_SINGLE_PTE, - THP_RUN_SINGLE_PTE_SWAPOUT, - THP_RUN_PARTIAL_MREMAP, - THP_RUN_PARTIAL_SHARED, -}; - -static void do_run_with_thp(test_fn fn, enum thp_run thp_run) -{ - char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED; - size_t size, mmap_size, mremap_size; - int ret; - - /* For alignment purposes, we need twice the thp size. */ - mmap_size = 2 * thpsize; - mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mmap_mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return; - } - - /* We need a THP-aligned memory area. */ - mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); - - ret = madvise(mem, thpsize, MADV_HUGEPAGE); - if (ret) { - ksft_test_result_fail("MADV_HUGEPAGE failed\n"); - goto munmap; - } - - /* - * Try to populate a THP. Touch the first sub-page and test if we get - * another sub-page populated automatically. - */ - mem[0] = 0; - if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) { - ksft_test_result_skip("Did not get a THP populated\n"); - goto munmap; - } - memset(mem, 0, thpsize); - - size = thpsize; - switch (thp_run) { - case THP_RUN_PMD: - case THP_RUN_PMD_SWAPOUT: - break; - case THP_RUN_PTE: - case THP_RUN_PTE_SWAPOUT: - /* - * Trigger PTE-mapping the THP by temporarily mapping a single - * subpage R/O. - */ - ret = mprotect(mem + pagesize, pagesize, PROT_READ); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto munmap; - } - ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto munmap; - } - break; - case THP_RUN_SINGLE_PTE: - case THP_RUN_SINGLE_PTE_SWAPOUT: - /* - * Discard all but a single subpage of that PTE-mapped THP. What - * remains is a single PTE mapping a single subpage. - */ - ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED); - if (ret) { - ksft_test_result_fail("MADV_DONTNEED failed\n"); - goto munmap; - } - size = pagesize; - break; - case THP_RUN_PARTIAL_MREMAP: - /* - * Remap half of the THP. We need some new memory location - * for that. - */ - mremap_size = thpsize / 2; - mremap_mem = mmap(NULL, mremap_size, PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - tmp = mremap(mem + mremap_size, mremap_size, mremap_size, - MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem); - if (tmp != mremap_mem) { - ksft_test_result_fail("mremap() failed\n"); - goto munmap; - } - size = mremap_size; - break; - case THP_RUN_PARTIAL_SHARED: - /* - * Share the first page of the THP with a child and quit the - * child. This will result in some parts of the THP never - * have been shared. - */ - ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK); - if (ret) { - ksft_test_result_fail("MADV_DONTFORK failed\n"); - goto munmap; - } - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto munmap; - } else if (!ret) { - exit(0); - } - wait(&ret); - /* Allow for sharing all pages again. 
*/ - ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK); - if (ret) { - ksft_test_result_fail("MADV_DOFORK failed\n"); - goto munmap; - } - break; - default: - assert(false); - } - - switch (thp_run) { - case THP_RUN_PMD_SWAPOUT: - case THP_RUN_PTE_SWAPOUT: - case THP_RUN_SINGLE_PTE_SWAPOUT: - madvise(mem, size, MADV_PAGEOUT); - if (!range_is_swapped(mem, size)) { - ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); - goto munmap; - } - break; - default: - break; - } - - fn(mem, size); -munmap: - munmap(mmap_mem, mmap_size); - if (mremap_mem != MAP_FAILED) - munmap(mremap_mem, mremap_size); -} - -static void run_with_thp(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with THP\n", desc); - do_run_with_thp(fn, THP_RUN_PMD); -} - -static void run_with_thp_swap(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc); - do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT); -} - -static void run_with_pte_mapped_thp(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc); - do_run_with_thp(fn, THP_RUN_PTE); -} - -static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc); - do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT); -} - -static void run_with_single_pte_of_thp(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc); - do_run_with_thp(fn, THP_RUN_SINGLE_PTE); -} - -static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc); - do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT); -} - -static void run_with_partial_mremap_thp(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc); - do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP); -} - -static void run_with_partial_shared_thp(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc); - do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED); -} - -static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) -{ - int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; - char *mem, *dummy; - - ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc, - hugetlbsize / 1024); - - flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT; - - mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_skip("need more free huge pages\n"); - return; - } - - /* Populate an huge page. */ - memset(mem, 0, hugetlbsize); - - /* - * We need a total of two hugetlb pages to handle COW/unsharing - * properly, otherwise we might get zapped by a SIGBUS. - */ - dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); - if (dummy == MAP_FAILED) { - ksft_test_result_skip("need more free huge pages\n"); - goto munmap; - } - munmap(dummy, hugetlbsize); - - fn(mem, hugetlbsize); -munmap: - munmap(mem, hugetlbsize); -} - -struct test_case { - const char *desc; - test_fn fn; -}; - -/* - * Test cases that are specific to anonymous pages: pages in private mappings - * that may get shared via COW during fork(). - */ -static const struct test_case anon_test_cases[] = { - /* - * Basic COW tests for fork() without any GUP. If we miss to break COW, - * either the child can observe modifications by the parent or the - * other way around. 
- */ - { - "Basic COW after fork()", - test_cow_in_parent, - }, - /* - * Basic test, but do an additional mprotect(PROT_READ)+ - * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. - */ - { - "Basic COW after fork() with mprotect() optimization", - test_cow_in_parent_mprotect, - }, - /* - * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If - * we miss to break COW, the child observes modifications by the parent. - * This is CVE-2020-29374 reported by Jann Horn. - */ - { - "vmsplice() + unmap in child", - test_vmsplice_in_child - }, - /* - * vmsplice() test, but do an additional mprotect(PROT_READ)+ - * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. - */ - { - "vmsplice() + unmap in child with mprotect() optimization", - test_vmsplice_in_child_mprotect - }, - /* - * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after - * fork(); modify in the child. If we miss to break COW, the parent - * observes modifications by the child. - */ - { - "vmsplice() before fork(), unmap in parent after fork()", - test_vmsplice_before_fork, - }, - /* - * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the - * child. If we miss to break COW, the parent observes modifications by - * the child. - */ - { - "vmsplice() + unmap in parent after fork()", - test_vmsplice_after_fork, - }, -#ifdef LOCAL_CONFIG_HAVE_LIBURING - /* - * Take a R/W longterm pin and then map the page R/O into the page - * table to trigger a write fault on next access. When modifying the - * page, the page content must be visible via the pin. - */ - { - "R/O-mapping a page registered as iouring fixed buffer", - test_iouring_ro, - }, - /* - * Take a R/W longterm pin and then fork() a child. When modifying the - * page, the page content must be visible via the pin. We expect the - * pinned page to not get shared with the child. - */ - { - "fork() with an iouring fixed buffer", - test_iouring_fork, - }, - -#endif /* LOCAL_CONFIG_HAVE_LIBURING */ - /* - * Take a R/O longterm pin on a R/O-mapped shared anonymous page. - * When modifying the page via the page table, the page content change - * must be visible via the pin. - */ - { - "R/O GUP pin on R/O-mapped shared page", - test_ro_pin_on_shared, - }, - /* Same as above, but using GUP-fast. */ - { - "R/O GUP-fast pin on R/O-mapped shared page", - test_ro_fast_pin_on_shared, - }, - /* - * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that - * was previously shared. When modifying the page via the page table, - * the page content change must be visible via the pin. - */ - { - "R/O GUP pin on R/O-mapped previously-shared page", - test_ro_pin_on_ro_previously_shared, - }, - /* Same as above, but using GUP-fast. */ - { - "R/O GUP-fast pin on R/O-mapped previously-shared page", - test_ro_fast_pin_on_ro_previously_shared, - }, - /* - * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page. - * When modifying the page via the page table, the page content change - * must be visible via the pin. - */ - { - "R/O GUP pin on R/O-mapped exclusive page", - test_ro_pin_on_ro_exclusive, - }, - /* Same as above, but using GUP-fast. 
*/ - { - "R/O GUP-fast pin on R/O-mapped exclusive page", - test_ro_fast_pin_on_ro_exclusive, - }, -}; - -static void run_anon_test_case(struct test_case const *test_case) -{ - int i; - - run_with_base_page(test_case->fn, test_case->desc); - run_with_base_page_swap(test_case->fn, test_case->desc); - if (thpsize) { - run_with_thp(test_case->fn, test_case->desc); - run_with_thp_swap(test_case->fn, test_case->desc); - run_with_pte_mapped_thp(test_case->fn, test_case->desc); - run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc); - run_with_single_pte_of_thp(test_case->fn, test_case->desc); - run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc); - run_with_partial_mremap_thp(test_case->fn, test_case->desc); - run_with_partial_shared_thp(test_case->fn, test_case->desc); - } - for (i = 0; i < nr_hugetlbsizes; i++) - run_with_hugetlb(test_case->fn, test_case->desc, - hugetlbsizes[i]); -} - -static void run_anon_test_cases(void) -{ - int i; - - ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); - - for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++) - run_anon_test_case(&anon_test_cases[i]); -} - -static int tests_per_anon_test_case(void) -{ - int tests = 2 + nr_hugetlbsizes; - - if (thpsize) - tests += 8; - return tests; -} - -enum anon_thp_collapse_test { - ANON_THP_COLLAPSE_UNSHARED, - ANON_THP_COLLAPSE_FULLY_SHARED, - ANON_THP_COLLAPSE_LOWER_SHARED, - ANON_THP_COLLAPSE_UPPER_SHARED, -}; - -static void do_test_anon_thp_collapse(char *mem, size_t size, - enum anon_thp_collapse_test test) -{ - struct comm_pipes comm_pipes; - char buf; - int ret; - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - return; - } - - /* - * Trigger PTE-mapping the THP by temporarily mapping a single subpage - * R/O, such that we can try collapsing it later. - */ - ret = mprotect(mem + pagesize, pagesize, PROT_READ); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto close_comm_pipes; - } - ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto close_comm_pipes; - } - - switch (test) { - case ANON_THP_COLLAPSE_UNSHARED: - /* Collapse before actually COW-sharing the page. */ - ret = madvise(mem, size, MADV_COLLAPSE); - if (ret) { - ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", - strerror(errno)); - goto close_comm_pipes; - } - break; - case ANON_THP_COLLAPSE_FULLY_SHARED: - /* COW-share the full PTE-mapped THP. */ - break; - case ANON_THP_COLLAPSE_LOWER_SHARED: - /* Don't COW-share the upper part of the THP. */ - ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); - if (ret) { - ksft_test_result_fail("MADV_DONTFORK failed\n"); - goto close_comm_pipes; - } - break; - case ANON_THP_COLLAPSE_UPPER_SHARED: - /* Don't COW-share the lower part of the THP. 
*/ - ret = madvise(mem, size / 2, MADV_DONTFORK); - if (ret) { - ksft_test_result_fail("MADV_DONTFORK failed\n"); - goto close_comm_pipes; - } - break; - default: - assert(false); - } - - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_comm_pipes; - } else if (!ret) { - switch (test) { - case ANON_THP_COLLAPSE_UNSHARED: - case ANON_THP_COLLAPSE_FULLY_SHARED: - exit(child_memcmp_fn(mem, size, &comm_pipes)); - break; - case ANON_THP_COLLAPSE_LOWER_SHARED: - exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); - break; - case ANON_THP_COLLAPSE_UPPER_SHARED: - exit(child_memcmp_fn(mem + size / 2, size / 2, - &comm_pipes)); - break; - default: - assert(false); - } - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - - switch (test) { - case ANON_THP_COLLAPSE_UNSHARED: - break; - case ANON_THP_COLLAPSE_UPPER_SHARED: - case ANON_THP_COLLAPSE_LOWER_SHARED: - /* - * Revert MADV_DONTFORK such that we merge the VMAs and are - * able to actually collapse. - */ - ret = madvise(mem, size, MADV_DOFORK); - if (ret) { - ksft_test_result_fail("MADV_DOFORK failed\n"); - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - goto close_comm_pipes; - } - /* FALLTHROUGH */ - case ANON_THP_COLLAPSE_FULLY_SHARED: - /* Collapse before anyone modified the COW-shared page. */ - ret = madvise(mem, size, MADV_COLLAPSE); - if (ret) { - ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", - strerror(errno)); - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - goto close_comm_pipes; - } - break; - default: - assert(false); - } - - /* Modify the page. */ - memset(mem, 0xff, size); - write(comm_pipes.parent_ready[1], "0", 1); - - wait(&ret); - if (WIFEXITED(ret)) - ret = WEXITSTATUS(ret); - else - ret = -EINVAL; - - ksft_test_result(!ret, "No leak from parent into child\n"); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -} - -static void test_anon_thp_collapse_unshared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); -} - -static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); -} - -static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); -} - -static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); -} - -/* - * Test cases that are specific to anonymous THP: pages in private mappings - * that may get shared via COW during fork(). - */ -static const struct test_case anon_thp_test_cases[] = { - /* - * Basic COW test for fork() without any GUP when collapsing a THP - * before fork(). - * - * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place - * collapse") might easily get COW handling wrong when not collapsing - * exclusivity information properly. - */ - { - "Basic COW after fork() when collapsing before fork()", - test_anon_thp_collapse_unshared, - }, - /* Basic COW test, but collapse after COW-sharing a full THP. */ - { - "Basic COW after fork() when collapsing after fork() (fully shared)", - test_anon_thp_collapse_fully_shared, - }, - /* - * Basic COW test, but collapse after COW-sharing the lower half of a - * THP. - */ - { - "Basic COW after fork() when collapsing after fork() (lower shared)", - test_anon_thp_collapse_lower_shared, - }, - /* - * Basic COW test, but collapse after COW-sharing the upper half of a - * THP. 
- */ - { - "Basic COW after fork() when collapsing after fork() (upper shared)", - test_anon_thp_collapse_upper_shared, - }, -}; - -static void run_anon_thp_test_cases(void) -{ - int i; - - if (!thpsize) - return; - - ksft_print_msg("[INFO] Anonymous THP tests\n"); - - for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) { - struct test_case const *test_case = &anon_thp_test_cases[i]; - - ksft_print_msg("[RUN] %s\n", test_case->desc); - do_run_with_thp(test_case->fn, THP_RUN_PMD); - } -} - -static int tests_per_anon_thp_test_case(void) -{ - return thpsize ? 1 : 0; -} - -typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); - -static void test_cow(char *mem, const char *smem, size_t size) -{ - char *old = malloc(size); - - /* Backup the original content. */ - memcpy(old, smem, size); - - /* Modify the page. */ - memset(mem, 0xff, size); - - /* See if we still read the old values via the other mapping. */ - ksft_test_result(!memcmp(smem, old, size), - "Other mapping not modified\n"); - free(old); -} - -static void test_ro_pin(char *mem, const char *smem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST, false); -} - -static void test_ro_fast_pin(char *mem, const char *smem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST, true); -} - -static void run_with_zeropage(non_anon_test_fn fn, const char *desc) -{ - char *mem, *smem, tmp; - - ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc); - - mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return; - } - - smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - - /* Read from the page to populate the shared zeropage. */ - tmp = *mem + *smem; - asm volatile("" : "+r" (tmp)); - - fn(mem, smem, pagesize); -munmap: - munmap(mem, pagesize); - if (smem != MAP_FAILED) - munmap(smem, pagesize); -} - -static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) -{ - char *mem, *smem, *mmap_mem, *mmap_smem, tmp; - size_t mmap_size; - int ret; - - ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc); - - if (!has_huge_zeropage) { - ksft_test_result_skip("Huge zeropage not enabled\n"); - return; - } - - /* For alignment purposes, we need twice the thp size. */ - mmap_size = 2 * thpsize; - mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mmap_mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return; - } - mmap_smem = mmap(NULL, mmap_size, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mmap_smem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - - /* We need a THP-aligned memory area. */ - mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); - smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1)); - - ret = madvise(mem, thpsize, MADV_HUGEPAGE); - ret |= madvise(smem, thpsize, MADV_HUGEPAGE); - if (ret) { - ksft_test_result_fail("MADV_HUGEPAGE failed\n"); - goto munmap; - } - - /* - * Read from the memory to populate the huge shared zeropage. Read from - * the first sub-page and test if we get another sub-page populated - * automatically. 
- */ - tmp = *mem + *smem; - asm volatile("" : "+r" (tmp)); - if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || - !pagemap_is_populated(pagemap_fd, smem + pagesize)) { - ksft_test_result_skip("Did not get THPs populated\n"); - goto munmap; - } - - fn(mem, smem, thpsize); -munmap: - munmap(mmap_mem, mmap_size); - if (mmap_smem != MAP_FAILED) - munmap(mmap_smem, mmap_size); -} - -static void run_with_memfd(non_anon_test_fn fn, const char *desc) -{ - char *mem, *smem, tmp; - int fd; - - ksft_print_msg("[RUN] %s ... with memfd\n", desc); - - fd = memfd_create("test", 0); - if (fd < 0) { - ksft_test_result_fail("memfd_create() failed\n"); - return; - } - - /* File consists of a single page filled with zeroes. */ - if (fallocate(fd, 0, 0, pagesize)) { - ksft_test_result_fail("fallocate() failed\n"); - goto close; - } - - /* Create a private mapping of the memfd. */ - mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto close; - } - smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - - /* Fault the page in. */ - tmp = *mem + *smem; - asm volatile("" : "+r" (tmp)); - - fn(mem, smem, pagesize); -munmap: - munmap(mem, pagesize); - if (smem != MAP_FAILED) - munmap(smem, pagesize); -close: - close(fd); -} - -static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) -{ - char *mem, *smem, tmp; - FILE *file; - int fd; - - ksft_print_msg("[RUN] %s ... with tmpfile\n", desc); - - file = tmpfile(); - if (!file) { - ksft_test_result_fail("tmpfile() failed\n"); - return; - } - - fd = fileno(file); - if (fd < 0) { - ksft_test_result_skip("fileno() failed\n"); - return; - } - - /* File consists of a single page filled with zeroes. */ - if (fallocate(fd, 0, 0, pagesize)) { - ksft_test_result_fail("fallocate() failed\n"); - goto close; - } - - /* Create a private mapping of the memfd. */ - mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto close; - } - smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - - /* Fault the page in. */ - tmp = *mem + *smem; - asm volatile("" : "+r" (tmp)); - - fn(mem, smem, pagesize); -munmap: - munmap(mem, pagesize); - if (smem != MAP_FAILED) - munmap(smem, pagesize); -close: - fclose(file); -} - -static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, - size_t hugetlbsize) -{ - int flags = MFD_HUGETLB; - char *mem, *smem, tmp; - int fd; - - ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc, - hugetlbsize / 1024); - - flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT; - - fd = memfd_create("test", flags); - if (fd < 0) { - ksft_test_result_skip("memfd_create() failed\n"); - return; - } - - /* File consists of a single page filled with zeroes. */ - if (fallocate(fd, 0, 0, hugetlbsize)) { - ksft_test_result_skip("need more free huge pages\n"); - goto close; - } - - /* Create a private mapping of the memfd. 
-
-	/* Create a private mapping of the memfd. */
-	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
-		   0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_skip("need more free huge pages\n");
-		goto close;
-	}
-	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
-	if (smem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto munmap;
-	}
-
-	/* Fault the page in. */
-	tmp = *mem + *smem;
-	asm volatile("" : "+r" (tmp));
-
-	fn(mem, smem, hugetlbsize);
-munmap:
-	munmap(mem, hugetlbsize);
-	if (smem != MAP_FAILED)
-		munmap(smem, hugetlbsize);
-close:
-	close(fd);
-}
-
-struct non_anon_test_case {
-	const char *desc;
-	non_anon_test_fn fn;
-};
-
-/*
- * Test cases that target any pages in private mappings that are not anonymous:
- * pages that may get shared via COW independent of fork(). This includes
- * the shared zeropage(s), pagecache pages, ...
- */
-static const struct non_anon_test_case non_anon_test_cases[] = {
-	/*
-	 * Basic COW test without any GUP. If we fail to break COW, changes are
-	 * visible via other private/shared mappings.
-	 */
-	{
-		"Basic COW",
-		test_cow,
-	},
-	/*
-	 * Take a R/O longterm pin. When modifying the page via the page table,
-	 * the page content change must be visible via the pin.
-	 */
-	{
-		"R/O longterm GUP pin",
-		test_ro_pin,
-	},
-	/* Same as above, but using GUP-fast. */
-	{
-		"R/O longterm GUP-fast pin",
-		test_ro_fast_pin,
-	},
-};
-
-static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
-{
-	int i;
-
-	run_with_zeropage(test_case->fn, test_case->desc);
-	run_with_memfd(test_case->fn, test_case->desc);
-	run_with_tmpfile(test_case->fn, test_case->desc);
-	if (thpsize)
-		run_with_huge_zeropage(test_case->fn, test_case->desc);
-	for (i = 0; i < nr_hugetlbsizes; i++)
-		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
-				       hugetlbsizes[i]);
-}
-
-static void run_non_anon_test_cases(void)
-{
-	int i;
-
-	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
-
-	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
-		run_non_anon_test_case(&non_anon_test_cases[i]);
-}
-
-static int tests_per_non_anon_test_case(void)
-{
-	int tests = 3 + nr_hugetlbsizes;
-
-	if (thpsize)
-		tests += 1;
-	return tests;
-}
-
-int main(int argc, char **argv)
-{
-	int err;
-
-	pagesize = getpagesize();
-	detect_thpsize();
-	detect_hugetlbsizes();
-	detect_huge_zeropage();
-
-	ksft_print_header();
-	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
-		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
-		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
-
-	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
-	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-	if (pagemap_fd < 0)
-		ksft_exit_fail_msg("opening pagemap failed\n");
-
-	run_anon_test_cases();
-	run_anon_thp_test_cases();
-	run_non_anon_test_cases();
-
-	err = ksft_get_fail_cnt();
-	if (err)
-		ksft_exit_fail_msg("%d out of %d tests failed\n",
-				   err, ksft_test_num());
-	return ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c
deleted file mode 100644
index e43879291dac..000000000000
--- a/tools/testing/selftests/vm/gup_test.c
+++ /dev/null
@@ -1,271 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "../kselftest.h"
-
-#include "util.h"
-
-#define MB (1UL << 20)
-
-/* Just the flags we need, copied from mm.h: */
-#define FOLL_WRITE	0x01	/* check pte is writable */
-#define FOLL_TOUCH 0x02 /* mark page accessed */ - -#define GUP_TEST_FILE "/sys/kernel/debug/gup_test" - -static unsigned long cmd = GUP_FAST_BENCHMARK; -static int gup_fd, repeats = 1; -static unsigned long size = 128 * MB; -/* Serialize prints */ -static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER; - -static char *cmd_to_str(unsigned long cmd) -{ - switch (cmd) { - case GUP_FAST_BENCHMARK: - return "GUP_FAST_BENCHMARK"; - case PIN_FAST_BENCHMARK: - return "PIN_FAST_BENCHMARK"; - case PIN_LONGTERM_BENCHMARK: - return "PIN_LONGTERM_BENCHMARK"; - case GUP_BASIC_TEST: - return "GUP_BASIC_TEST"; - case PIN_BASIC_TEST: - return "PIN_BASIC_TEST"; - case DUMP_USER_PAGES_TEST: - return "DUMP_USER_PAGES_TEST"; - } - return "Unknown command"; -} - -void *gup_thread(void *data) -{ - struct gup_test gup = *(struct gup_test *)data; - int i; - - /* Only report timing information on the *_BENCHMARK commands: */ - if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || - (cmd == PIN_LONGTERM_BENCHMARK)) { - for (i = 0; i < repeats; i++) { - gup.size = size; - if (ioctl(gup_fd, cmd, &gup)) - perror("ioctl"), exit(1); - - pthread_mutex_lock(&print_mutex); - printf("%s: Time: get:%lld put:%lld us", - cmd_to_str(cmd), gup.get_delta_usec, - gup.put_delta_usec); - if (gup.size != size) - printf(", truncated (size: %lld)", gup.size); - printf("\n"); - pthread_mutex_unlock(&print_mutex); - } - } else { - gup.size = size; - if (ioctl(gup_fd, cmd, &gup)) { - perror("ioctl"); - exit(1); - } - - pthread_mutex_lock(&print_mutex); - printf("%s: done\n", cmd_to_str(cmd)); - if (gup.size != size) - printf("Truncated (size: %lld)\n", gup.size); - pthread_mutex_unlock(&print_mutex); - } - - return NULL; -} - -int main(int argc, char **argv) -{ - struct gup_test gup = { 0 }; - int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; - int flags = MAP_PRIVATE, touch = 0; - char *file = "/dev/zero"; - pthread_t *tid; - char *p; - - while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) { - switch (opt) { - case 'a': - cmd = PIN_FAST_BENCHMARK; - break; - case 'b': - cmd = PIN_BASIC_TEST; - break; - case 'L': - cmd = PIN_LONGTERM_BENCHMARK; - break; - case 'c': - cmd = DUMP_USER_PAGES_TEST; - /* - * Dump page 0 (index 1). May be overridden later, by - * user's non-option arguments. - * - * .which_pages is zero-based, so that zero can mean "do - * nothing". 
- */ - gup.which_pages[0] = 1; - break; - case 'p': - /* works only with DUMP_USER_PAGES_TEST */ - gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN; - break; - case 'F': - /* strtol, so you can pass flags in hex form */ - gup.gup_flags = strtol(optarg, 0, 0); - break; - case 'j': - nthreads = atoi(optarg); - break; - case 'm': - size = atoi(optarg) * MB; - break; - case 'r': - repeats = atoi(optarg); - break; - case 'n': - nr_pages = atoi(optarg); - break; - case 't': - thp = 1; - break; - case 'T': - thp = 0; - break; - case 'U': - cmd = GUP_BASIC_TEST; - break; - case 'u': - cmd = GUP_FAST_BENCHMARK; - break; - case 'w': - write = 1; - break; - case 'W': - write = 0; - break; - case 'f': - file = optarg; - break; - case 'S': - flags &= ~MAP_PRIVATE; - flags |= MAP_SHARED; - break; - case 'H': - flags |= (MAP_HUGETLB | MAP_ANONYMOUS); - break; - case 'z': - /* fault pages in gup, do not fault in userland */ - touch = 1; - break; - default: - return -1; - } - } - - if (optind < argc) { - int extra_arg_count = 0; - /* - * For example: - * - * ./gup_test -c 0 1 0x1001 - * - * ...to dump pages 0, 1, and 4097 - */ - - while ((optind < argc) && - (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) { - /* - * Do the 1-based indexing here, so that the user can - * use normal 0-based indexing on the command line. - */ - long page_index = strtol(argv[optind], 0, 0) + 1; - - gup.which_pages[extra_arg_count] = page_index; - extra_arg_count++; - optind++; - } - } - - filed = open(file, O_RDWR|O_CREAT); - if (filed < 0) { - perror("open"); - exit(filed); - } - - gup.nr_pages_per_call = nr_pages; - if (write) - gup.gup_flags |= FOLL_WRITE; - - gup_fd = open(GUP_TEST_FILE, O_RDWR); - if (gup_fd == -1) { - switch (errno) { - case EACCES: - if (getuid()) - printf("Please run this test as root\n"); - break; - case ENOENT: - if (opendir("/sys/kernel/debug") == NULL) { - printf("mount debugfs at /sys/kernel/debug\n"); - break; - } - printf("check if CONFIG_GUP_TEST is enabled in kernel config\n"); - break; - default: - perror("failed to open " GUP_TEST_FILE); - break; - } - exit(KSFT_SKIP); - } - - p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); - if (p == MAP_FAILED) { - perror("mmap"); - exit(1); - } - gup.addr = (unsigned long)p; - - if (thp == 1) - madvise(p, size, MADV_HUGEPAGE); - else if (thp == 0) - madvise(p, size, MADV_NOHUGEPAGE); - - /* - * FOLL_TOUCH, in gup_test, is used as an either/or case: either - * fault pages in from the kernel via FOLL_TOUCH, or fault them - * in here, from user space. This allows comparison of performance - * between those two cases. - */ - if (touch) { - gup.gup_flags |= FOLL_TOUCH; - } else { - for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) - p[0] = 0; - } - - tid = malloc(sizeof(pthread_t) * nthreads); - assert(tid); - for (i = 0; i < nthreads; i++) { - ret = pthread_create(&tid[i], NULL, gup_thread, &gup); - assert(ret == 0); - } - for (i = 0; i < nthreads; i++) { - ret = pthread_join(tid[i], NULL); - assert(ret == 0); - } - free(tid); - - return 0; -} diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c deleted file mode 100644 index 4adaad1b822f..000000000000 --- a/tools/testing/selftests/vm/hmm-tests.c +++ /dev/null @@ -1,2054 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * HMM stands for Heterogeneous Memory Management, it is a helper layer inside - * the linux kernel to help device drivers mirror a process address space in - * the device. 
This allows the device to use the same address space which - * makes communication and data exchange a lot easier. - * - * This framework's sole purpose is to exercise various code paths inside - * the kernel to make sure that HMM performs as expected and to flush out any - * bugs. - */ - -#include "../kselftest_harness.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/* - * This is a private UAPI to the kernel test module so it isn't exported - * in the usual include/uapi/... directory. - */ -#include -#include - -struct hmm_buffer { - void *ptr; - void *mirror; - unsigned long size; - int fd; - uint64_t cpages; - uint64_t faults; -}; - -enum { - HMM_PRIVATE_DEVICE_ONE, - HMM_PRIVATE_DEVICE_TWO, - HMM_COHERENCE_DEVICE_ONE, - HMM_COHERENCE_DEVICE_TWO, -}; - -#define TWOMEG (1 << 21) -#define HMM_BUFFER_SIZE (1024 << 12) -#define HMM_PATH_MAX 64 -#define NTIMES 10 - -#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) -/* Just the flags we need, copied from mm.h: */ -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite */ - -FIXTURE(hmm) -{ - int fd; - unsigned int page_size; - unsigned int page_shift; -}; - -FIXTURE_VARIANT(hmm) -{ - int device_number; -}; - -FIXTURE_VARIANT_ADD(hmm, hmm_device_private) -{ - .device_number = HMM_PRIVATE_DEVICE_ONE, -}; - -FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent) -{ - .device_number = HMM_COHERENCE_DEVICE_ONE, -}; - -FIXTURE(hmm2) -{ - int fd0; - int fd1; - unsigned int page_size; - unsigned int page_shift; -}; - -FIXTURE_VARIANT(hmm2) -{ - int device_number0; - int device_number1; -}; - -FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private) -{ - .device_number0 = HMM_PRIVATE_DEVICE_ONE, - .device_number1 = HMM_PRIVATE_DEVICE_TWO, -}; - -FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent) -{ - .device_number0 = HMM_COHERENCE_DEVICE_ONE, - .device_number1 = HMM_COHERENCE_DEVICE_TWO, -}; - -static int hmm_open(int unit) -{ - char pathname[HMM_PATH_MAX]; - int fd; - - snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit); - fd = open(pathname, O_RDWR, 0); - if (fd < 0) - fprintf(stderr, "could not open hmm dmirror driver (%s)\n", - pathname); - return fd; -} - -static bool hmm_is_coherent_type(int dev_num) -{ - return (dev_num >= HMM_COHERENCE_DEVICE_ONE); -} - -FIXTURE_SETUP(hmm) -{ - self->page_size = sysconf(_SC_PAGE_SIZE); - self->page_shift = ffs(self->page_size) - 1; - - self->fd = hmm_open(variant->device_number); - if (self->fd < 0 && hmm_is_coherent_type(variant->device_number)) - SKIP(exit(0), "DEVICE_COHERENT not available"); - ASSERT_GE(self->fd, 0); -} - -FIXTURE_SETUP(hmm2) -{ - self->page_size = sysconf(_SC_PAGE_SIZE); - self->page_shift = ffs(self->page_size) - 1; - - self->fd0 = hmm_open(variant->device_number0); - if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0)) - SKIP(exit(0), "DEVICE_COHERENT not available"); - ASSERT_GE(self->fd0, 0); - self->fd1 = hmm_open(variant->device_number1); - ASSERT_GE(self->fd1, 0); -} - -FIXTURE_TEARDOWN(hmm) -{ - int ret = close(self->fd); - - ASSERT_EQ(ret, 0); - self->fd = -1; -} - -FIXTURE_TEARDOWN(hmm2) -{ - int ret = close(self->fd0); - - ASSERT_EQ(ret, 0); - self->fd0 = -1; - - ret = close(self->fd1); - ASSERT_EQ(ret, 0); - self->fd1 = -1; -} - -static int hmm_dmirror_cmd(int fd, - unsigned long request, - struct hmm_buffer *buffer, - unsigned long npages) -{ - struct hmm_dmirror_cmd cmd; - int ret; - - /* Simulate a device 
reading system memory. */ - cmd.addr = (__u64)buffer->ptr; - cmd.ptr = (__u64)buffer->mirror; - cmd.npages = npages; - - for (;;) { - ret = ioctl(fd, request, &cmd); - if (ret == 0) - break; - if (errno == EINTR) - continue; - return -errno; - } - buffer->cpages = cmd.cpages; - buffer->faults = cmd.faults; - - return 0; -} - -static void hmm_buffer_free(struct hmm_buffer *buffer) -{ - if (buffer == NULL) - return; - - if (buffer->ptr) - munmap(buffer->ptr, buffer->size); - free(buffer->mirror); - free(buffer); -} - -/* - * Create a temporary file that will be deleted on close. - */ -static int hmm_create_file(unsigned long size) -{ - char path[HMM_PATH_MAX]; - int fd; - - strcpy(path, "/tmp"); - fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600); - if (fd >= 0) { - int r; - - do { - r = ftruncate(fd, size); - } while (r == -1 && errno == EINTR); - if (!r) - return fd; - close(fd); - } - return -1; -} - -/* - * Return a random unsigned number. - */ -static unsigned int hmm_random(void) -{ - static int fd = -1; - unsigned int r; - - if (fd < 0) { - fd = open("/dev/urandom", O_RDONLY); - if (fd < 0) { - fprintf(stderr, "%s:%d failed to open /dev/urandom\n", - __FILE__, __LINE__); - return ~0U; - } - } - read(fd, &r, sizeof(r)); - return r; -} - -static void hmm_nanosleep(unsigned int n) -{ - struct timespec t; - - t.tv_sec = 0; - t.tv_nsec = n; - nanosleep(&t, NULL); -} - -static int hmm_migrate_sys_to_dev(int fd, - struct hmm_buffer *buffer, - unsigned long npages) -{ - return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages); -} - -static int hmm_migrate_dev_to_sys(int fd, - struct hmm_buffer *buffer, - unsigned long npages) -{ - return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages); -} - -/* - * Simple NULL test of device open/close. - */ -TEST_F(hmm, open_close) -{ -} - -/* - * Read private anonymous memory. - */ -TEST_F(hmm, anon_read) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - int val; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* - * Initialize buffer in system memory but leave the first two pages - * zero (pte_none and pfn_zero). - */ - i = 2 * self->page_size / sizeof(*ptr); - for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Set buffer permission to read-only. */ - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - - /* Populate the CPU page table with a special zero page. */ - val = *(int *)(buffer->ptr + self->page_size); - ASSERT_EQ(val, 0); - - /* Simulate a device reading system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - ptr = buffer->mirror; - for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], 0); - for (; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Read private anonymous memory which has been protected with - * mprotect() PROT_NONE. 
- */ -TEST_F(hmm, anon_read_prot) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Initialize mirror buffer so we can verify it isn't written. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; - - /* Protect buffer from reading. */ - ret = mprotect(buffer->ptr, size, PROT_NONE); - ASSERT_EQ(ret, 0); - - /* Simulate a device reading system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, -EFAULT); - - /* Allow CPU to read the buffer so we can check it. */ - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - hmm_buffer_free(buffer); -} - -/* - * Write private anonymous memory. - */ -TEST_F(hmm, anon_write) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Write private anonymous memory which has been protected with - * mprotect() PROT_READ. - */ -TEST_F(hmm, anon_write_prot) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Simulate a device reading a zero page of memory. 
*/ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 1); - ASSERT_EQ(buffer->faults, 1); - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, -EPERM); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], 0); - - /* Now allow writing and see that the zero page is replaced. */ - ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ); - ASSERT_EQ(ret, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Check that a device writing an anonymous private mapping - * will copy-on-write if a child process inherits the mapping. - */ -TEST_F(hmm, anon_write_child) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - pid_t pid; - int child_fd; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer->ptr so we can tell if it is written. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; - - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (pid != 0) { - waitpid(pid, &ret, 0); - ASSERT_EQ(WIFEXITED(ret), 1); - - /* Check that the parent's buffer did not change. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - return; - } - - /* Check that we see the parent's values. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - /* The child process needs its own mirror to its own mm. */ - child_fd = hmm_open(0); - ASSERT_GE(child_fd, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - close(child_fd); - exit(0); -} - -/* - * Check that a device writing an anonymous shared mapping - * will not copy-on-write if a child process inherits the mapping. 
- */ -TEST_F(hmm, anon_write_child_shared) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - pid_t pid; - int child_fd; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer->ptr so we can tell if it is written. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; - - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (pid != 0) { - waitpid(pid, &ret, 0); - ASSERT_EQ(WIFEXITED(ret), 1); - - /* Check that the parent's buffer did change. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - return; - } - - /* Check that we see the parent's values. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - /* The child process needs its own mirror to its own mm. */ - child_fd = hmm_open(0); - ASSERT_GE(child_fd, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - close(child_fd); - exit(0); -} - -/* - * Write private anonymous huge page. - */ -TEST_F(hmm, anon_write_huge) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - void *old_ptr; - void *map; - int *ptr; - int ret; - - size = 2 * TWOMEG; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - size = TWOMEG; - npages = size >> self->page_shift; - map = (void *)ALIGN((uintptr_t)buffer->ptr, size); - ret = madvise(map, size, MADV_HUGEPAGE); - ASSERT_EQ(ret, 0); - old_ptr = buffer->ptr; - buffer->ptr = map; - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - buffer->ptr = old_ptr; - hmm_buffer_free(buffer); -} - -/* - * Read numeric data from raw and tagged kernel status files. Used to read - * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). 
- */
-static long file_read_ulong(char *file, const char *tag)
-{
-	int fd;
-	char buf[2048];
-	int len;
-	char *p, *q;
-	long val;
-
-	fd = open(file, O_RDONLY);
-	if (fd < 0) {
-		/* Error opening the file */
-		return -1;
-	}
-
-	len = read(fd, buf, sizeof(buf));
-	close(fd);
-	if (len < 0) {
-		/* Error in reading the file */
-		return -1;
-	}
-	if (len == sizeof(buf)) {
-		/* Error file is too large */
-		return -1;
-	}
-	buf[len] = '\0';
-
-	/* Search for a tag if provided */
-	if (tag) {
-		p = strstr(buf, tag);
-		if (!p)
-			return -1; /* looks like the line we want isn't there */
-		p += strlen(tag);
-	} else
-		p = buf;
-
-	val = strtol(p, &q, 0);
-	if (*q != ' ') {
-		/* Error parsing the file */
-		return -1;
-	}
-
-	return val;
-}
-
-/*
- * Write huge TLBFS page.
- */
-TEST_F(hmm, anon_write_hugetlbfs)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	long default_hsize;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
-	if (default_hsize < 0 || default_hsize*1024 < default_hsize)
-		SKIP(return, "Huge page size could not be determined");
-	default_hsize = default_hsize*1024; /* KB to B */
-
-	size = ALIGN(TWOMEG, default_hsize);
-	npages = size >> self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-			   -1, 0);
-	if (buffer->ptr == MAP_FAILED) {
-		free(buffer);
-		SKIP(return, "Huge page could not be allocated");
-	}
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	/* Initialize data that the device will write to buffer->ptr. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Simulate a device writing system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device wrote. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	munmap(buffer->ptr, buffer->size);
-	buffer->ptr = NULL;
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Read mmap'ed file memory.
- */
-TEST_F(hmm, file_read)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-	int fd;
-	ssize_t len;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	fd = hmm_create_file(size);
-	ASSERT_GE(fd, 0);
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = fd;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	/* Write initial contents of the file. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-	len = pwrite(fd, buffer->mirror, size, 0);
-	ASSERT_EQ(len, size);
-	memset(buffer->mirror, 0, size);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ,
-			   MAP_SHARED,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Simulate a device reading system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device read.
*/ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Write mmap'ed file memory. - */ -TEST_F(hmm, file_write) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - int fd; - ssize_t len; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - fd = hmm_create_file(size); - ASSERT_GE(fd, 0); - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = fd; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_SHARED, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Check that the device also wrote the file. */ - len = pread(fd, buffer->mirror, size, 0); - ASSERT_EQ(len, size); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous memory to device private memory. - */ -TEST_F(hmm, migrate) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous memory to device private memory and fault some of it back - * to system memory, then try migrating the resulting mix of system and device - * private memory to the device. - */ -TEST_F(hmm, migrate_fault) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. 
*/ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Fault half the pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) - ASSERT_EQ(ptr[i], i); - - /* Migrate memory to the device again. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -TEST_F(hmm, migrate_release) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Release device memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages); - ASSERT_EQ(ret, 0); - - /* Fault pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous shared memory to device private memory. - */ -TEST_F(hmm, migrate_shared) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, -ENOENT); - - hmm_buffer_free(buffer); -} - -/* - * Try to migrate various memory types to device private memory. - */ -TEST_F(hmm2, migrate_mixed) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - int *ptr; - unsigned char *p; - int ret; - int val; - - npages = 6; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - /* Reserve a range of addresses. 
*/ - buffer->ptr = mmap(NULL, size, - PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - p = buffer->ptr; - - /* Migrating a protected area should be an error. */ - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); - ASSERT_EQ(ret, -EINVAL); - - /* Punch a hole after the first page address. */ - ret = munmap(buffer->ptr + self->page_size, self->page_size); - ASSERT_EQ(ret, 0); - - /* We expect an error if the vma doesn't cover the range. */ - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3); - ASSERT_EQ(ret, -EINVAL); - - /* Page 2 will be a read-only zero page. */ - ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 2 * self->page_size); - val = *ptr + 3; - ASSERT_EQ(val, 3); - - /* Page 3 will be read-only. */ - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 3 * self->page_size); - *ptr = val; - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - - /* Page 4-5 will be read-write. */ - ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 4 * self->page_size); - *ptr = val; - ptr = (int *)(buffer->ptr + 5 * self->page_size); - *ptr = val; - - /* Now try to migrate pages 2-5 to device 1. */ - buffer->ptr = p + 2 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 4); - - /* Page 5 won't be migrated to device 0 because it's on device 1. */ - buffer->ptr = p + 5 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); - ASSERT_EQ(ret, -ENOENT); - buffer->ptr = p; - - buffer->ptr = p; - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous memory to device memory and back to system memory - * multiple times. In case of private zone configuration, this is done - * through fault pages accessed by CPU. In case of coherent zone configuration, - * the pages from the device should be explicitly migrated back to system memory. - * The reason is Coherent device zone has coherent access by CPU, therefore - * it will not generate any page fault. - */ -TEST_F(hmm, migrate_multiple) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - unsigned long c; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - for (c = 0; c < NTIMES; c++) { - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Migrate back to system memory and check them. 
*/ - if (hmm_is_coherent_type(variant->device_number)) { - ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - } - - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); - } -} - -/* - * Read anonymous memory multiple times. - */ -TEST_F(hmm, anon_read_multiple) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - unsigned long c; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - for (c = 0; c < NTIMES; c++) { - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i + c; - - /* Simulate a device reading system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, - npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i + c); - - hmm_buffer_free(buffer); - } -} - -void *unmap_buffer(void *p) -{ - struct hmm_buffer *buffer = p; - - /* Delay for a bit and then unmap buffer while it is being read. */ - hmm_nanosleep(hmm_random() % 32000); - munmap(buffer->ptr + buffer->size / 2, buffer->size / 2); - buffer->ptr = NULL; - - return NULL; -} - -/* - * Try reading anonymous memory while it is being unmapped. - */ -TEST_F(hmm, anon_teardown) -{ - unsigned long npages; - unsigned long size; - unsigned long c; - void *ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - for (c = 0; c < NTIMES; ++c) { - pthread_t thread; - struct hmm_buffer *buffer; - unsigned long i; - int *ptr; - int rc; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i + c; - - rc = pthread_create(&thread, NULL, unmap_buffer, buffer); - ASSERT_EQ(rc, 0); - - /* Simulate a device reading system memory. */ - rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, - npages); - if (rc == 0) { - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; - i < size / sizeof(*ptr); - ++i) - ASSERT_EQ(ptr[i], i + c); - } - - pthread_join(thread, &ret); - hmm_buffer_free(buffer); - } -} - -/* - * Test memory snapshot without faulting in pages accessed by the device. 
- */ -TEST_F(hmm, mixedmap) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned char *m; - int ret; - - npages = 1; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(npages); - ASSERT_NE(buffer->mirror, NULL); - - - /* Reserve a range of addresses. */ - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE, - self->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Simulate a device snapshotting CPU pagetables. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device saw. */ - m = buffer->mirror; - ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ); - - hmm_buffer_free(buffer); -} - -/* - * Test memory snapshot without faulting in pages accessed by the device. - */ -TEST_F(hmm2, snapshot) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - int *ptr; - unsigned char *p; - unsigned char *m; - int ret; - int val; - - npages = 7; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(npages); - ASSERT_NE(buffer->mirror, NULL); - - /* Reserve a range of addresses. */ - buffer->ptr = mmap(NULL, size, - PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - p = buffer->ptr; - - /* Punch a hole after the first page address. */ - ret = munmap(buffer->ptr + self->page_size, self->page_size); - ASSERT_EQ(ret, 0); - - /* Page 2 will be read-only zero page. */ - ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 2 * self->page_size); - val = *ptr + 3; - ASSERT_EQ(val, 3); - - /* Page 3 will be read-only. */ - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 3 * self->page_size); - *ptr = val; - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - - /* Page 4-6 will be read-write. */ - ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 4 * self->page_size); - *ptr = val; - - /* Page 5 will be migrated to device 0. */ - buffer->ptr = p + 5 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 1); - - /* Page 6 will be migrated to device 1. */ - buffer->ptr = p + 6 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 1); - - /* Simulate a device snapshotting CPU pagetables. */ - buffer->ptr = p; - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device saw. 
 */
-	m = buffer->mirror;
-	ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR);
-	ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR);
-	ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ);
-	ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ);
-	ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE);
-	if (!hmm_is_coherent_type(variant->device_number0)) {
-		ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL |
-				HMM_DMIRROR_PROT_WRITE);
-		ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE);
-	} else {
-		ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL |
-				HMM_DMIRROR_PROT_WRITE);
-		ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE |
-				HMM_DMIRROR_PROT_WRITE);
-	}
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that
- * should be mapped by a large page table entry.
- */
-TEST_F(hmm, compound)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	long default_hsize;
-	int *ptr;
-	unsigned char *m;
-	int ret;
-	unsigned long i;
-
-	/* Skip test if we can't allocate a hugetlbfs page. */
-
-	default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
-	if (default_hsize < 0 || default_hsize*1024 < default_hsize)
-		SKIP(return, "Huge page size could not be determined");
-	default_hsize = default_hsize*1024; /* KB to B */
-
-	size = ALIGN(TWOMEG, default_hsize);
-	npages = size >> self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-			   -1, 0);
-	if (buffer->ptr == MAP_FAILED) {
-		free(buffer);
-		return;
-	}
-
-	buffer->size = size;
-	buffer->mirror = malloc(npages);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	/* Initialize the pages the device will snapshot in buffer->ptr. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Simulate a device snapshotting CPU pagetables. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device saw. */
-	m = buffer->mirror;
-	for (i = 0; i < npages; ++i)
-		ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE |
-				HMM_DMIRROR_PROT_PMD);
-
-	/* Make the region read-only. */
-	ret = mprotect(buffer->ptr, size, PROT_READ);
-	ASSERT_EQ(ret, 0);
-
-	/* Simulate a device snapshotting CPU pagetables. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device saw. */
-	m = buffer->mirror;
-	for (i = 0; i < npages; ++i)
-		ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ |
-				HMM_DMIRROR_PROT_PMD);
-
-	munmap(buffer->ptr, buffer->size);
-	buffer->ptr = NULL;
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Test two devices reading the same memory (double mapped).
- */
-TEST_F(hmm2, double_map)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	npages = 6;
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(npages);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	/* Reserve a range of addresses. */
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Make region read-only.
*/ - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - - /* Simulate device 0 reading system memory. */ - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Simulate device 1 reading system memory. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Migrate pages to device 1 and try to read from device 0. */ - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what device 0 read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Basic check of exclusive faulting. - */ -TEST_F(hmm, exclusive) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Map memory exclusively for device access. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Fault pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i]++, i); - - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i+1); - - /* Check atomic access revoked */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - - hmm_buffer_free(buffer); -} - -TEST_F(hmm, exclusive_mprotect) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Map memory exclusively for device access. 
*/ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, -EPERM); - - hmm_buffer_free(buffer); -} - -/* - * Check copy-on-write works. - */ -TEST_F(hmm, exclusive_cow) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Map memory exclusively for device access. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - fork(); - - /* Fault pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i]++, i); - - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i+1); - - hmm_buffer_free(buffer); -} - -static int gup_test_exec(int gup_fd, unsigned long addr, int cmd, - int npages, int size, int flags) -{ - struct gup_test gup = { - .nr_pages_per_call = npages, - .addr = addr, - .gup_flags = FOLL_WRITE | flags, - .size = size, - }; - - if (ioctl(gup_fd, cmd, &gup)) { - perror("ioctl on error\n"); - return errno; - } - - return 0; -} - -/* - * Test get user device pages through gup_test. Setting PIN_LONGTERM flag. - * This should trigger a migration back to system memory for both, private - * and coherent type pages. - * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added - * to your configuration before you run it. - */ -TEST_F(hmm, hmm_gup_test) -{ - struct hmm_buffer *buffer; - int gup_fd; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - unsigned char *m; - - gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); - if (gup_fd == -1) - SKIP(return, "Skipping test, could not find gup_test driver"); - - npages = 4; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - /* Check what the device read. 
*/
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	ASSERT_EQ(gup_test_exec(gup_fd,
-				(unsigned long)buffer->ptr,
-				GUP_BASIC_TEST, 1, self->page_size, 0), 0);
-	ASSERT_EQ(gup_test_exec(gup_fd,
-				(unsigned long)buffer->ptr + 1 * self->page_size,
-				GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0);
-	ASSERT_EQ(gup_test_exec(gup_fd,
-				(unsigned long)buffer->ptr + 2 * self->page_size,
-				PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0);
-	ASSERT_EQ(gup_test_exec(gup_fd,
-				(unsigned long)buffer->ptr + 3 * self->page_size,
-				PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0);
-
-	/* Take snapshot to CPU pagetables */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	m = buffer->mirror;
-	if (hmm_is_coherent_type(variant->device_number)) {
-		ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]);
-		ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]);
-	} else {
-		ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
-		ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
-	}
-	ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]);
-	ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]);
-	/*
-	 * Check again the content on the pages. Make sure there's no
-	 * corrupted data.
-	 */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	close(gup_fd);
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Test copy-on-write in device pages.
- * In case of writing to COW private page(s), a page fault will migrate pages
- * back to system memory first. Then, these pages will be duplicated. In case
- * of COW device coherent type, pages are duplicated directly from device
- * memory.
- */
-TEST_F(hmm, hmm_cow_in_device)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-	unsigned char *m;
-	pid_t pid;
-	int status;
-
-	npages = 4;
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Migrate memory to device. */
-
-	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	pid = fork();
-	if (pid == -1)
-		ASSERT_EQ(pid, 0);
-	if (!pid) {
-		/* Child process waits for SIGTERM from the parent. */
-		while (1) {
-		}
-		perror("Should not reach this\n");
-		exit(0);
-	}
-	/* Parent process writes to COW page(s) and gets a
-	 * new copy in system. In case of device private pages,
-	 * this write causes a migration to system mem first.
- */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Terminate child and wait */ - EXPECT_EQ(0, kill(pid, SIGTERM)); - EXPECT_EQ(pid, waitpid(pid, &status, 0)); - EXPECT_NE(0, WIFSIGNALED(status)); - EXPECT_EQ(SIGTERM, WTERMSIG(status)); - - /* Take snapshot to CPU pagetables */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - m = buffer->mirror; - for (i = 0; i < npages; i++) - ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]); - - hmm_buffer_free(buffer); -} -TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/vm/hugepage-mmap.c deleted file mode 100644 index 955ef87f382c..000000000000 --- a/tools/testing/selftests/vm/hugepage-mmap.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-mmap: - * - * Example of using huge page memory in a user application using the mmap - * system call. Before running this application, make sure that the - * administrator has mounted the hugetlbfs filesystem (on some directory - * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this - * example, the app is requesting memory of size 256MB that is backed by - * huge pages. - * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. - * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. - */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -#define LENGTH (256UL*1024*1024) -#define PROTECTION (PROT_READ | PROT_WRITE) - -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_SHARED | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_SHARED) -#endif - -static void check_bytes(char *addr) -{ - printf("First hex is %x\n", *((unsigned int *)addr)); -} - -static void write_bytes(char *addr) -{ - unsigned long i; - - for (i = 0; i < LENGTH; i++) - *(addr + i) = (char)i; -} - -static int read_bytes(char *addr) -{ - unsigned long i; - - check_bytes(addr); - for (i = 0; i < LENGTH; i++) - if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); - return 1; - } - return 0; -} - -int main(void) -{ - void *addr; - int fd, ret; - - fd = memfd_create("hugepage-mmap", MFD_HUGETLB); - if (fd < 0) { - perror("memfd_create() failed"); - exit(1); - } - - addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - close(fd); - exit(1); - } - - printf("Returned address is %p\n", addr); - check_bytes(addr); - write_bytes(addr); - ret = read_bytes(addr); - - munmap(addr, LENGTH); - close(fd); - - return ret; -} diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c deleted file mode 100644 index e53b5eaa8fce..000000000000 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ /dev/null @@ -1,188 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-mremap: - * - * Example of remapping huge page memory in a user application using the - * mremap system call. The path to a file in a hugetlbfs filesystem must - * be passed as the last argument to this test. The amount of memory used - * by this test in MBs can optionally be passed as an argument. 
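For orientation while reading these removals: the hugepage-mmap example above and the hugepage-mremap test below both build on the same primitive, an anonymous hugetlbfs file obtained from memfd_create(MFD_HUGETLB). A minimal stand-alone sketch of that pattern (not part of this patch; it assumes a 2MB default huge page size and at least one free page in the hugetlb pool):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define LEN (2UL * 1024 * 1024)	/* one default-sized huge page (assumed 2MB) */

int main(void)
{
	/* Anonymous hugetlbfs file: no hugetlbfs mount is needed, but
	 * /proc/sys/vm/nr_hugepages must provide at least one free page.
	 */
	int fd = memfd_create("hugetlb-example", MFD_HUGETLB);
	char *p;

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	p = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;	/* first touch faults in a whole huge page */
	munmap(p, LEN);	/* length must stay huge-page aligned */
	close(fd);
	return 0;
}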
If no memory - * amount is passed, the default amount is 10MB. - * - * To make sure the test triggers pmd sharing and goes through the 'unshare' - * path in the mremap code use 1GB (1024) or more. - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include /* Definition of O_* constants */ -#include /* Definition of SYS_* constants */ -#include -#include -#include - -#define DEFAULT_LENGTH_MB 10UL -#define MB_TO_BYTES(x) (x * 1024 * 1024) - -#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) -#define FLAGS (MAP_SHARED | MAP_ANONYMOUS) - -static void check_bytes(char *addr) -{ - printf("First hex is %x\n", *((unsigned int *)addr)); -} - -static void write_bytes(char *addr, size_t len) -{ - unsigned long i; - - for (i = 0; i < len; i++) - *(addr + i) = (char)i; -} - -static int read_bytes(char *addr, size_t len) -{ - unsigned long i; - - check_bytes(addr); - for (i = 0; i < len; i++) - if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); - return 1; - } - return 0; -} - -static void register_region_with_uffd(char *addr, size_t len) -{ - long uffd; /* userfaultfd file descriptor */ - struct uffdio_api uffdio_api; - struct uffdio_register uffdio_register; - - /* Create and enable userfaultfd object. */ - - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd == -1) { - perror("userfaultfd"); - exit(1); - } - - uffdio_api.api = UFFD_API; - uffdio_api.features = 0; - if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { - perror("ioctl-UFFDIO_API"); - exit(1); - } - - /* Create a private anonymous mapping. The memory will be - * demand-zero paged--that is, not yet allocated. When we - * actually touch the memory, it will be allocated via - * the userfaultfd. - */ - - addr = mmap(NULL, len, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - printf("Address returned by mmap() = %p\n", addr); - - /* Register the memory range of the mapping we just created for - * handling by the userfaultfd object. In mode, we request to track - * missing pages (i.e., pages that have not yet been faulted in). - */ - - uffdio_register.range.start = (unsigned long)addr; - uffdio_register.range.len = len; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { - perror("ioctl-UFFDIO_REGISTER"); - exit(1); - } -} - -int main(int argc, char *argv[]) -{ - size_t length = 0; - int ret = 0, fd; - - if (argc >= 2 && !strcmp(argv[1], "-h")) { - printf("Usage: %s [length_in_MB]\n", argv[0]); - exit(1); - } - - /* Read memory length as the first arg if valid, otherwise fallback to - * the default length. - */ - if (argc >= 2) - length = (size_t)atoi(argv[1]); - else - length = DEFAULT_LENGTH_MB; - - length = MB_TO_BYTES(length); - fd = memfd_create(argv[0], MFD_HUGETLB); - if (fd < 0) { - perror("Open failed"); - exit(1); - } - - /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */ - unsigned long suggested_addr = 0x7eaa40000000; - void *haddr = mmap((void *)suggested_addr, length, PROTECTION, - MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); - printf("Map haddr: Returned address is %p\n", haddr); - if (haddr == MAP_FAILED) { - perror("mmap1"); - exit(1); - } - - /* mmap again to a dummy address to hopefully trigger pmd sharing. 
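register_region_with_uffd() above only creates the userfaultfd and registers a range; servicing the faults is out of scope for the test. For context, a monitor would normally resolve each missing-page event roughly as in the sketch below. This is illustrative only: handle_one_fault() and its parameters are not part of the selftest, and 'page' is assumed to be a prepared, page-aligned source buffer.

#include <poll.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

/* Wait for one missing-page fault on 'uffd' and resolve it by copying
 * 'page' (a page-aligned buffer we own) into the faulting address.
 */
static int handle_one_fault(long uffd, void *page, size_t page_size)
{
	struct pollfd pollfd = { .fd = (int)uffd, .events = POLLIN };
	struct uffd_msg msg;
	struct uffdio_copy copy;

	if (poll(&pollfd, 1, -1) <= 0)
		return -1;
	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
		return -1;
	if (msg.event != UFFD_EVENT_PAGEFAULT)
		return -1;

	/* UFFDIO_COPY wants a page-aligned destination. */
	copy.dst = msg.arg.pagefault.address & ~((__u64)page_size - 1);
	copy.src = (unsigned long)page;
	copy.len = page_size;
	copy.mode = 0;
	return ioctl(uffd, UFFDIO_COPY, &copy);
}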
*/ - suggested_addr = 0x7daa40000000; - void *daddr = mmap((void *)suggested_addr, length, PROTECTION, - MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); - printf("Map daddr: Returned address is %p\n", daddr); - if (daddr == MAP_FAILED) { - perror("mmap3"); - exit(1); - } - - suggested_addr = 0x7faa40000000; - void *vaddr = - mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0); - printf("Map vaddr: Returned address is %p\n", vaddr); - if (vaddr == MAP_FAILED) { - perror("mmap2"); - exit(1); - } - - register_region_with_uffd(haddr, length); - - void *addr = mremap(haddr, length, length, - MREMAP_MAYMOVE | MREMAP_FIXED, vaddr); - if (addr == MAP_FAILED) { - perror("mremap"); - exit(1); - } - - printf("Mremap: Returned address is %p\n", addr); - check_bytes(addr); - write_bytes(addr, length); - ret = read_bytes(addr, length); - - munmap(addr, length); - - addr = mremap(addr, length, length, 0); - if (addr != MAP_FAILED) { - printf("mremap: Expected failure, but call succeeded\n"); - exit(1); - } - - close(fd); - - return ret; -} diff --git a/tools/testing/selftests/vm/hugepage-shm.c b/tools/testing/selftests/vm/hugepage-shm.c deleted file mode 100644 index e2527f32005b..000000000000 --- a/tools/testing/selftests/vm/hugepage-shm.c +++ /dev/null @@ -1,101 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-shm: - * - * Example of using huge page memory in a user application using Sys V shared - * memory system calls. In this example the app is requesting 256MB of - * memory that is backed by huge pages. The application uses the flag - * SHM_HUGETLB in the shmget system call to inform the kernel that it is - * requesting huge pages. - * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. - * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. - * - * Note: The default shared memory limit is quite low on many kernels, - * you may need to increase it via: - * - * echo 268435456 > /proc/sys/kernel/shmmax - * - * This will increase the maximum size per shared memory segment to 256MB. - * The other limit that you will hit eventually is shmall which is the - * total amount of shared memory in pages. 
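Stripped of the hugetlb and userfaultfd machinery above, the core of the mremap test is the MREMAP_MAYMOVE | MREMAP_FIXED move itself, which can be exercised on ordinary anonymous pages. A self-contained sketch (not from the patch):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16 * 4096;

	/* A source mapping with data, and a reservation to move it onto. */
	char *src = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *dst = mmap(NULL, len, PROT_NONE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (src == MAP_FAILED || dst == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	src[0] = 42;

	/* MREMAP_FIXED unmaps whatever lives at 'dst' and must be paired
	 * with MREMAP_MAYMOVE, exactly as in the test above.
	 */
	char *moved = mremap(src, len, len,
			     MREMAP_MAYMOVE | MREMAP_FIXED, dst);
	if (moved == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("data followed the move: %d\n", moved[0]);
	return 0;
}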
To set it to 16GB on a system - * with a 4kB pagesize do: - * - * echo 4194304 > /proc/sys/kernel/shmall - */ - -#include -#include -#include -#include -#include -#include - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - -#define LENGTH (256UL*1024*1024) - -#define dprintf(x) printf(x) - -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define SHMAT_FLAGS (SHM_RND) -#else -#define ADDR (void *)(0x0UL) -#define SHMAT_FLAGS (0) -#endif - -int main(void) -{ - int shmid; - unsigned long i; - char *shmaddr; - - shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); - if (shmid < 0) { - perror("shmget"); - exit(1); - } - printf("shmid: 0x%x\n", shmid); - - shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); - if (shmaddr == (char *)-1) { - perror("Shared memory attach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(2); - } - printf("shmaddr: %p\n", shmaddr); - - dprintf("Starting the writes:\n"); - for (i = 0; i < LENGTH; i++) { - shmaddr[i] = (char)(i); - if (!(i % (1024 * 1024))) - dprintf("."); - } - dprintf("\n"); - - dprintf("Starting the Check..."); - for (i = 0; i < LENGTH; i++) - if (shmaddr[i] != (char)i) { - printf("\nIndex %lu mismatched\n", i); - exit(3); - } - dprintf("Done.\n"); - - if (shmdt((const void *)shmaddr) != 0) { - perror("Detach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(4); - } - - shmctl(shmid, IPC_RMID, NULL); - - return 0; -} diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c deleted file mode 100644 index 557bdbd4f87e..000000000000 --- a/tools/testing/selftests/vm/hugepage-vmemmap.c +++ /dev/null @@ -1,144 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * A test case of using hugepage memory in a user application using the - * mmap system call with MAP_HUGETLB flag. Before running this program - * make sure the administrator has allocated enough default sized huge - * pages to cover the 2 MB allocation. - */ -#include -#include -#include -#include -#include - -#define MAP_LENGTH (2UL * 1024 * 1024) - -#ifndef MAP_HUGETLB -#define MAP_HUGETLB 0x40000 /* arch specific */ -#endif - -#define PAGE_SIZE 4096 - -#define PAGE_COMPOUND_HEAD (1UL << 15) -#define PAGE_COMPOUND_TAIL (1UL << 16) -#define PAGE_HUGE (1UL << 17) - -#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) -#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) - -#define PM_PFRAME_BITS 55 -#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) - -/* - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. 
- */ -#ifdef __ia64__ -#define MAP_ADDR (void *)(0x8000000000000000UL) -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define MAP_ADDR NULL -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - -static void write_bytes(char *addr, size_t length) -{ - unsigned long i; - - for (i = 0; i < length; i++) - *(addr + i) = (char)i; -} - -static unsigned long virt_to_pfn(void *addr) -{ - int fd; - unsigned long pagemap; - - fd = open("/proc/self/pagemap", O_RDONLY); - if (fd < 0) - return -1UL; - - lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); - read(fd, &pagemap, sizeof(pagemap)); - close(fd); - - return pagemap & ~PM_PFRAME_MASK; -} - -static int check_page_flags(unsigned long pfn) -{ - int fd, i; - unsigned long pageflags; - - fd = open("/proc/kpageflags", O_RDONLY); - if (fd < 0) - return -1; - - lseek(fd, pfn * sizeof(pageflags), SEEK_SET); - - read(fd, &pageflags, sizeof(pageflags)); - if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { - close(fd); - printf("Head page flags (%lx) is invalid\n", pageflags); - return -1; - } - - /* - * pages other than the first page must be tail and shouldn't be head; - * this also verifies kernel has correctly set the fake page_head to tail - * while hugetlb_free_vmemmap is enabled. - */ - for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { - read(fd, &pageflags, sizeof(pageflags)); - if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || - (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { - close(fd); - printf("Tail page flags (%lx) is invalid\n", pageflags); - return -1; - } - } - - close(fd); - - return 0; -} - -int main(int argc, char **argv) -{ - void *addr; - unsigned long pfn; - - addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* Trigger allocation of HugeTLB page. */ - write_bytes(addr, MAP_LENGTH); - - pfn = virt_to_pfn(addr); - if (pfn == -1UL) { - munmap(addr, MAP_LENGTH); - perror("virt_to_pfn"); - exit(1); - } - - printf("Returned address is %p whose pfn is %lx\n", addr, pfn); - - if (check_page_flags(pfn) < 0) { - munmap(addr, MAP_LENGTH); - perror("check_page_flags"); - exit(1); - } - - /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, MAP_LENGTH)) { - perror("munmap"); - exit(1); - } - - return 0; -} diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c deleted file mode 100644 index a634f47d1e56..000000000000 --- a/tools/testing/selftests/vm/hugetlb-madvise.c +++ /dev/null @@ -1,406 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-madvise: - * - * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE - * on hugetlb mappings. - * - * Before running this test, make sure the administrator has pre-allocated - * at least MIN_FREE_PAGES hugetlb pages and they are free. In addition, - * the test takes an argument that is the path to a file in a hugetlbfs - * filesystem. Therefore, a hugetlbfs filesystem must be mounted on some - * directory. 
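virt_to_pfn() above compresses the /proc/self/pagemap protocol into a few lines. Spelled out, with the present bit checked (which the test omits), the lookup looks roughly like the sketch below; vaddr_to_pfn() is an illustrative name rather than part of any selftest, and reading real PFNs requires CAP_SYS_ADMIN on recent kernels.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* Pagemap entry layout (Documentation/admin-guide/mm/pagemap.rst):
 * bits 0-54 hold the PFN, bit 63 is the "present" flag.
 */
static uint64_t vaddr_to_pfn(void *vaddr, size_t page_size)
{
	uint64_t entry = 0;
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return UINT64_MAX;

	/* One 8-byte entry per virtual page, indexed by page number. */
	if (pread(fd, &entry, sizeof(entry),
		  (uintptr_t)vaddr / page_size * sizeof(entry)) != sizeof(entry))
		entry = 0;
	close(fd);

	if (!(entry & (1ULL << 63)))
		return UINT64_MAX;	/* page not present */
	return entry & ((1ULL << 55) - 1);
}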
- */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#define __USE_GNU -#include - -#define MIN_FREE_PAGES 20 -#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ - -#define validate_free_pages(exp_free) \ - do { \ - int fhp = get_free_hugepages(); \ - if (fhp != (exp_free)) { \ - printf("Unexpected number of free huge " \ - "pages line %d\n", __LINE__); \ - exit(1); \ - } \ - } while (0) - -unsigned long huge_page_size; -unsigned long base_page_size; - -/* - * default_huge_page_size copied from mlock2-tests.c - */ -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - -unsigned long get_free_hugepages(void) -{ - unsigned long fhp = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return fhp; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1) - break; - } - - free(line); - fclose(f); - return fhp; -} - -void write_fault_pages(void *addr, unsigned long nr_pages) -{ - unsigned long i; - - for (i = 0; i < nr_pages; i++) - *((unsigned long *)(addr + (i * huge_page_size))) = i; -} - -void read_fault_pages(void *addr, unsigned long nr_pages) -{ - unsigned long dummy = 0; - unsigned long i; - - for (i = 0; i < nr_pages; i++) - dummy += *((unsigned long *)(addr + (i * huge_page_size))); -} - -int main(int argc, char **argv) -{ - unsigned long free_hugepages; - void *addr, *addr2; - int fd; - int ret; - - huge_page_size = default_huge_page_size(); - if (!huge_page_size) { - printf("Unable to determine huge page size, exiting!\n"); - exit(1); - } - base_page_size = sysconf(_SC_PAGE_SIZE); - if (!huge_page_size) { - printf("Unable to determine base page size, exiting!\n"); - exit(1); - } - - free_hugepages = get_free_hugepages(); - if (free_hugepages < MIN_FREE_PAGES) { - printf("Not enough free huge pages to test, exiting!\n"); - exit(1); - } - - fd = memfd_create(argv[0], MFD_HUGETLB); - if (fd < 0) { - perror("memfd_create() failed"); - exit(1); - } - - /* - * Test validity of MADV_DONTNEED addr and length arguments. mmap - * size is NR_HUGE_PAGES + 2. One page at the beginning and end of - * the mapping will be unmapped so we KNOW there is nothing mapped - * there. 
- */ - addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - if (munmap(addr, huge_page_size) || - munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, - huge_page_size)) { - perror("munmap"); - exit(1); - } - addr = addr + huge_page_size; - - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* addr before mapping should fail */ - ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size, - MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with invalid addr line %d\n", - __LINE__); - exit(1); - } - - /* addr + length after mapping should fail */ - ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size, - MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with invalid length line %d\n", - __LINE__); - exit(1); - } - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test alignment of MADV_DONTNEED addr and length arguments - */ - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* addr is not huge page size aligned and should fail */ - ret = madvise(addr + base_page_size, - NR_HUGE_PAGES * huge_page_size - base_page_size, - MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with unaligned start address %d\n", - __LINE__); - exit(1); - } - - /* addr + length should be aligned down to huge page size */ - if (madvise(addr, - ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, - MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - - /* should free all but last page in mapping */ - validate_free_pages(free_hugepages - 1); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - validate_free_pages(free_hugepages); - - /* - * Test MADV_DONTNEED on anonymous private mapping - */ - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - - /* should free all pages in mapping */ - validate_free_pages(free_hugepages); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test MADV_DONTNEED on private mapping of hugetlb file - */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* read should not consume any pages */ - read_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* madvise should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* writes should allocate private pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages 
- (2 * NR_HUGE_PAGES)); - - /* madvise should free private pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* writes should allocate private pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* - * The fallocate below certainly should free the pages associated - * with the file. However, pages in the private mapping are also - * freed. This is not the 'correct' behavior, but is expected - * because this is how it has worked since the initial hugetlb - * implementation. - */ - if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test MADV_DONTNEED on shared mapping of hugetlb file - */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* write should not consume any pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* madvise should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* - * Test MADV_REMOVE on shared mapping of hugetlb file - * - * madvise is same as hole punch and should free all pages. - */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages); - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test MADV_REMOVE on shared and private mapping of hugetlb file - */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* shared write should not consume any additional pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE, fd, 0); - if (addr2 == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* private read should not consume any pages */ - read_fault_pages(addr2, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* private write should consume additional pages */ - write_fault_pages(addr2, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* madvise of shared mapping should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* madvise of private mapping should free private pages */ - if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* private write should consume additional pages again */ - write_fault_pages(addr2, 
NR_HUGE_PAGES); - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* - * madvise should free both file and private pages although this is - * not correct. private pages should not be freed, but this is - * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call. - */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); - - close(fd); - return 0; -} diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh deleted file mode 100644 index bf2d2a684edf..000000000000 --- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh +++ /dev/null @@ -1,252 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -set -e - -if [[ $(id -u) -ne 0 ]]; then - echo "This test must be run as root. Skipping..." - exit $ksft_skip -fi - -usage_file=usage_in_bytes - -if [[ "$1" == "-cgroup-v2" ]]; then - cgroup2=1 - usage_file=current -fi - - -if [[ $cgroup2 ]]; then - CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') - if [[ -z "$CGROUP_ROOT" ]]; then - CGROUP_ROOT=/dev/cgroup/memory - mount -t cgroup2 none $CGROUP_ROOT - do_umount=1 - fi - echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control -else - CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') - if [[ -z "$CGROUP_ROOT" ]]; then - CGROUP_ROOT=/dev/cgroup/memory - mount -t cgroup memory,hugetlb $CGROUP_ROOT - do_umount=1 - fi -fi -MNT='/mnt/huge/' - -function get_machine_hugepage_size() { - hpz=$(grep -i hugepagesize /proc/meminfo) - kb=${hpz:14:-3} - mb=$(($kb / 1024)) - echo $mb -} - -MB=$(get_machine_hugepage_size) - -function cleanup() { - echo cleanup - set +e - rm -rf "$MNT"/* 2>/dev/null - umount "$MNT" 2>/dev/null - rmdir "$MNT" 2>/dev/null - rmdir "$CGROUP_ROOT"/a/b 2>/dev/null - rmdir "$CGROUP_ROOT"/a 2>/dev/null - rmdir "$CGROUP_ROOT"/test1 2>/dev/null - echo 0 >/proc/sys/vm/nr_hugepages - set -e -} - -function assert_state() { - local expected_a="$1" - local expected_a_hugetlb="$2" - local expected_b="" - local expected_b_hugetlb="" - - if [ ! -z ${3:-} ] && [ ! 
-z ${4:-} ]; then - expected_b="$3" - expected_b_hugetlb="$4" - fi - local tolerance=$((5 * 1024 * 1024)) - - local actual_a - actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)" - if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] || - [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then - echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB - echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - local actual_a_hugetlb - actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)" - if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] || - [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then - echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB - echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then - return - fi - - local actual_b - actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)" - if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] || - [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then - echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB - echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - local actual_b_hugetlb - actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)" - if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] || - [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then - echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB - echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi -} - -function setup() { - echo 100 >/proc/sys/vm/nr_hugepages - mkdir "$CGROUP_ROOT"/a - sleep 1 - if [[ $cgroup2 ]]; then - echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control - else - echo 0 >$CGROUP_ROOT/a/cpuset.mems - echo 0 >$CGROUP_ROOT/a/cpuset.cpus - fi - - mkdir "$CGROUP_ROOT"/a/b - - if [[ ! $cgroup2 ]]; then - echo 0 >$CGROUP_ROOT/a/b/cpuset.mems - echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus - fi - - mkdir -p "$MNT" - mount -t hugetlbfs none "$MNT" -} - -write_hugetlbfs() { - local cgroup="$1" - local path="$2" - local size="$3" - - if [[ $cgroup2 ]]; then - echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs - else - echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems - echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus - echo $$ >"$CGROUP_ROOT/$cgroup/tasks" - fi - ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o - if [[ $cgroup2 ]]; then - echo $$ >$CGROUP_ROOT/cgroup.procs - else - echo $$ >"$CGROUP_ROOT/tasks" - fi - echo -} - -set -e - -size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages. - -cleanup - -echo -echo -echo Test charge, rmdir, uncharge -setup -echo mkdir -mkdir $CGROUP_ROOT/test1 - -echo write -write_hugetlbfs test1 "$MNT"/test $size - -echo rmdir -rmdir $CGROUP_ROOT/test1 -mkdir $CGROUP_ROOT/test1 - -echo uncharge -rm -rf /mnt/huge/* - -cleanup - -echo done -echo -echo -if [[ ! $cgroup2 ]]; then - echo "Test parent and child hugetlb usage" - setup - - echo write - write_hugetlbfs a "$MNT"/test $size - - echo Assert memory charged correctly for parent use. - assert_state 0 $size 0 0 - - write_hugetlbfs a/b "$MNT"/test2 $size - - echo Assert memory charged correctly for child use. - assert_state 0 $(($size * 2)) 0 $size - - rmdir "$CGROUP_ROOT"/a/b - sleep 5 - echo Assert memory reparent correctly. 
- assert_state 0 $(($size * 2)) - - rm -rf "$MNT"/* - umount "$MNT" - echo Assert memory uncharged correctly. - assert_state 0 0 - - cleanup -fi - -echo -echo -echo "Test child only hugetlb usage" -echo setup -setup - -echo write -write_hugetlbfs a/b "$MNT"/test2 $size - -echo Assert memory charged correctly for child only use. -assert_state 0 $(($size)) 0 $size - -rmdir "$CGROUP_ROOT"/a/b -echo Assert memory reparent correctly. -assert_state 0 $size - -rm -rf "$MNT"/* -umount "$MNT" -echo Assert memory uncharged correctly. -assert_state 0 0 - -cleanup - -echo ALL PASS - -umount $CGROUP_ROOT -rm -rf $CGROUP_ROOT diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c deleted file mode 100644 index 64126c8cd561..000000000000 --- a/tools/testing/selftests/vm/khugepaged.c +++ /dev/null @@ -1,1558 +0,0 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "linux/magic.h" - -#include "vm_util.h" - -#ifndef MADV_PAGEOUT -#define MADV_PAGEOUT 21 -#endif -#ifndef MADV_POPULATE_READ -#define MADV_POPULATE_READ 22 -#endif -#ifndef MADV_COLLAPSE -#define MADV_COLLAPSE 25 -#endif - -#define BASE_ADDR ((void *)(1UL << 30)) -static unsigned long hpage_pmd_size; -static unsigned long page_size; -static int hpage_pmd_nr; - -#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" -#define PID_SMAPS "/proc/self/smaps" -#define TEST_FILE "collapse_test_file" - -#define MAX_LINE_LENGTH 500 - -enum vma_type { - VMA_ANON, - VMA_FILE, - VMA_SHMEM, -}; - -struct mem_ops { - void *(*setup_area)(int nr_hpages); - void (*cleanup_area)(void *p, unsigned long size); - void (*fault)(void *p, unsigned long start, unsigned long end); - bool (*check_huge)(void *addr, int nr_hpages); - const char *name; -}; - -static struct mem_ops *file_ops; -static struct mem_ops *anon_ops; -static struct mem_ops *shmem_ops; - -struct collapse_context { - void (*collapse)(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops, bool expect); - bool enforce_pte_scan_limits; - const char *name; -}; - -static struct collapse_context *khugepaged_context; -static struct collapse_context *madvise_context; - -struct file_info { - const char *dir; - char path[PATH_MAX]; - enum vma_type type; - int fd; - char dev_queue_read_ahead_path[PATH_MAX]; -}; - -static struct file_info finfo; - -enum thp_enabled { - THP_ALWAYS, - THP_MADVISE, - THP_NEVER, -}; - -static const char *thp_enabled_strings[] = { - "always", - "madvise", - "never", - NULL -}; - -enum thp_defrag { - THP_DEFRAG_ALWAYS, - THP_DEFRAG_DEFER, - THP_DEFRAG_DEFER_MADVISE, - THP_DEFRAG_MADVISE, - THP_DEFRAG_NEVER, -}; - -static const char *thp_defrag_strings[] = { - "always", - "defer", - "defer+madvise", - "madvise", - "never", - NULL -}; - -enum shmem_enabled { - SHMEM_ALWAYS, - SHMEM_WITHIN_SIZE, - SHMEM_ADVISE, - SHMEM_NEVER, - SHMEM_DENY, - SHMEM_FORCE, -}; - -static const char *shmem_enabled_strings[] = { - "always", - "within_size", - "advise", - "never", - "deny", - "force", - NULL -}; - -struct khugepaged_settings { - bool defrag; - unsigned int alloc_sleep_millisecs; - unsigned int scan_sleep_millisecs; - unsigned int max_ptes_none; - unsigned int max_ptes_swap; - unsigned int max_ptes_shared; - unsigned long pages_to_scan; -}; - -struct settings { - enum thp_enabled thp_enabled; - enum thp_defrag thp_defrag; - enum shmem_enabled shmem_enabled; - bool use_zero_page; - struct 
khugepaged_settings khugepaged; - unsigned long read_ahead_kb; -}; - -static struct settings saved_settings; -static bool skip_settings_restore; - -static int exit_status; - -static void success(const char *msg) -{ - printf(" \e[32m%s\e[0m\n", msg); -} - -static void fail(const char *msg) -{ - printf(" \e[31m%s\e[0m\n", msg); - exit_status++; -} - -static void skip(const char *msg) -{ - printf(" \e[33m%s\e[0m\n", msg); -} - -static int read_file(const char *path, char *buf, size_t buflen) -{ - int fd; - ssize_t numread; - - fd = open(path, O_RDONLY); - if (fd == -1) - return 0; - - numread = read(fd, buf, buflen - 1); - if (numread < 1) { - close(fd); - return 0; - } - - buf[numread] = '\0'; - close(fd); - - return (unsigned int) numread; -} - -static int write_file(const char *path, const char *buf, size_t buflen) -{ - int fd; - ssize_t numwritten; - - fd = open(path, O_WRONLY); - if (fd == -1) { - printf("open(%s)\n", path); - exit(EXIT_FAILURE); - return 0; - } - - numwritten = write(fd, buf, buflen - 1); - close(fd); - if (numwritten < 1) { - printf("write(%s)\n", buf); - exit(EXIT_FAILURE); - return 0; - } - - return (unsigned int) numwritten; -} - -static int read_string(const char *name, const char *strings[]) -{ - char path[PATH_MAX]; - char buf[256]; - char *c; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!read_file(path, buf, sizeof(buf))) { - perror(path); - exit(EXIT_FAILURE); - } - - c = strchr(buf, '['); - if (!c) { - printf("%s: Parse failure\n", __func__); - exit(EXIT_FAILURE); - } - - c++; - memmove(buf, c, sizeof(buf) - (c - buf)); - - c = strchr(buf, ']'); - if (!c) { - printf("%s: Parse failure\n", __func__); - exit(EXIT_FAILURE); - } - *c = '\0'; - - ret = 0; - while (strings[ret]) { - if (!strcmp(strings[ret], buf)) - return ret; - ret++; - } - - printf("Failed to parse %s\n", name); - exit(EXIT_FAILURE); -} - -static void write_string(const char *name, const char *val) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!write_file(path, val, strlen(val) + 1)) { - perror(path); - exit(EXIT_FAILURE); - } -} - -static const unsigned long _read_num(const char *path) -{ - char buf[21]; - - if (read_file(path, buf, sizeof(buf)) < 0) { - perror("read_file(read_num)"); - exit(EXIT_FAILURE); - } - - return strtoul(buf, NULL, 10); -} - -static const unsigned long read_num(const char *name) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - return _read_num(path); -} - -static void _write_num(const char *path, unsigned long num) -{ - char buf[21]; - - sprintf(buf, "%ld", num); - if (!write_file(path, buf, strlen(buf) + 1)) { - perror(path); - exit(EXIT_FAILURE); - } -} - -static void write_num(const char *name, unsigned long num) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - _write_num(path, num); -} - -static void write_settings(struct settings *settings) -{ - struct khugepaged_settings *khugepaged = &settings->khugepaged; - - write_string("enabled", thp_enabled_strings[settings->thp_enabled]); - 
write_string("defrag", thp_defrag_strings[settings->thp_defrag]); - write_string("shmem_enabled", - shmem_enabled_strings[settings->shmem_enabled]); - write_num("use_zero_page", settings->use_zero_page); - - write_num("khugepaged/defrag", khugepaged->defrag); - write_num("khugepaged/alloc_sleep_millisecs", - khugepaged->alloc_sleep_millisecs); - write_num("khugepaged/scan_sleep_millisecs", - khugepaged->scan_sleep_millisecs); - write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); - write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); - write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); - write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); - - if (file_ops && finfo.type == VMA_FILE) - _write_num(finfo.dev_queue_read_ahead_path, - settings->read_ahead_kb); -} - -#define MAX_SETTINGS_DEPTH 4 -static struct settings settings_stack[MAX_SETTINGS_DEPTH]; -static int settings_index; - -static struct settings *current_settings(void) -{ - if (!settings_index) { - printf("Fail: No settings set"); - exit(EXIT_FAILURE); - } - return settings_stack + settings_index - 1; -} - -static void push_settings(struct settings *settings) -{ - if (settings_index >= MAX_SETTINGS_DEPTH) { - printf("Fail: Settings stack exceeded"); - exit(EXIT_FAILURE); - } - settings_stack[settings_index++] = *settings; - write_settings(current_settings()); -} - -static void pop_settings(void) -{ - if (settings_index <= 0) { - printf("Fail: Settings stack empty"); - exit(EXIT_FAILURE); - } - --settings_index; - write_settings(current_settings()); -} - -static void restore_settings(int sig) -{ - if (skip_settings_restore) - goto out; - - printf("Restore THP and khugepaged settings..."); - write_settings(&saved_settings); - success("OK"); - if (sig) - exit(EXIT_FAILURE); -out: - exit(exit_status); -} - -static void save_settings(void) -{ - printf("Save THP and khugepaged settings..."); - saved_settings = (struct settings) { - .thp_enabled = read_string("enabled", thp_enabled_strings), - .thp_defrag = read_string("defrag", thp_defrag_strings), - .shmem_enabled = - read_string("shmem_enabled", shmem_enabled_strings), - .use_zero_page = read_num("use_zero_page"), - }; - saved_settings.khugepaged = (struct khugepaged_settings) { - .defrag = read_num("khugepaged/defrag"), - .alloc_sleep_millisecs = - read_num("khugepaged/alloc_sleep_millisecs"), - .scan_sleep_millisecs = - read_num("khugepaged/scan_sleep_millisecs"), - .max_ptes_none = read_num("khugepaged/max_ptes_none"), - .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), - .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), - .pages_to_scan = read_num("khugepaged/pages_to_scan"), - }; - if (file_ops && finfo.type == VMA_FILE) - saved_settings.read_ahead_kb = - _read_num(finfo.dev_queue_read_ahead_path); - - success("OK"); - - signal(SIGTERM, restore_settings); - signal(SIGINT, restore_settings); - signal(SIGHUP, restore_settings); - signal(SIGQUIT, restore_settings); -} - -static void get_finfo(const char *dir) -{ - struct stat path_stat; - struct statfs fs; - char buf[1 << 10]; - char path[PATH_MAX]; - char *str, *end; - - finfo.dir = dir; - stat(finfo.dir, &path_stat); - if (!S_ISDIR(path_stat.st_mode)) { - printf("%s: Not a directory (%s)\n", __func__, finfo.dir); - exit(EXIT_FAILURE); - } - if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, - finfo.dir) >= sizeof(finfo.path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - if (statfs(finfo.dir, &fs)) { - 
perror("statfs()");
-		exit(EXIT_FAILURE);
-	}
-	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
-	if (finfo.type == VMA_SHMEM)
-		return;
-
-	/* Find owning device's queue/read_ahead_kb control */
-	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
-		     major(path_stat.st_dev), minor(path_stat.st_dev))
-	    >= sizeof(path)) {
-		printf("%s: Pathname is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-	if (read_file(path, buf, sizeof(buf)) < 0) {
-		perror("read_file(read_num)");
-		exit(EXIT_FAILURE);
-	}
-	if (strstr(buf, "DEVTYPE=disk")) {
-		/* Found it */
-		if (snprintf(finfo.dev_queue_read_ahead_path,
-			     sizeof(finfo.dev_queue_read_ahead_path),
-			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
-			     major(path_stat.st_dev), minor(path_stat.st_dev))
-		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
-			printf("%s: Pathname is too long\n", __func__);
-			exit(EXIT_FAILURE);
-		}
-		return;
-	}
-	if (!strstr(buf, "DEVTYPE=partition")) {
-		printf("%s: Unknown device type: %s\n", __func__, path);
-		exit(EXIT_FAILURE);
-	}
-	/*
-	 * Partition of block device - need to find actual device.
-	 * Using naming convention that devnameN is partition of
-	 * device devname.
-	 */
-	str = strstr(buf, "DEVNAME=");
-	if (!str) {
-		printf("%s: Could not read: %s", __func__, path);
-		exit(EXIT_FAILURE);
-	}
-	str += 8;
-	end = str;
-	while (*end) {
-		if (isdigit(*end)) {
-			*end = '\0';
-			if (snprintf(finfo.dev_queue_read_ahead_path,
-				     sizeof(finfo.dev_queue_read_ahead_path),
-				     "/sys/block/%s/queue/read_ahead_kb",
-				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
-				printf("%s: Pathname is too long\n", __func__);
-				exit(EXIT_FAILURE);
-			}
-			return;
-		}
-		++end;
-	}
-	printf("%s: Could not read: %s\n", __func__, path);
-	exit(EXIT_FAILURE);
-}
-
-static bool check_swap(void *addr, unsigned long size)
-{
-	bool swap = false;
-	int ret;
-	FILE *fp;
-	char buffer[MAX_LINE_LENGTH];
-	char addr_pattern[MAX_LINE_LENGTH];
-
-	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
-		       (unsigned long) addr);
-	if (ret >= MAX_LINE_LENGTH) {
-		printf("%s: Pattern is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-
-
-	fp = fopen(PID_SMAPS, "r");
-	if (!fp) {
-		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
-		exit(EXIT_FAILURE);
-	}
-	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
-		goto err_out;
-
-	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
-		       size >> 10);
-	if (ret >= MAX_LINE_LENGTH) {
-		printf("%s: Pattern is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-	/*
-	 * Fetch the Swap: in the same block and check whether it got
-	 * the expected number of hugepages next.
-	 */
-	if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
-		goto err_out;
-
-	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
-		goto err_out;
-
-	swap = true;
-err_out:
-	fclose(fp);
-	return swap;
-}
-
-static void *alloc_mapping(int nr)
-{
-	void *p;
-
-	p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
-		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (p != BASE_ADDR) {
-		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
-		exit(EXIT_FAILURE);
-	}
-
-	return p;
-}
-
-static void fill_memory(int *p, unsigned long start, unsigned long end)
-{
-	int i;
-
-	for (i = start / page_size; i < end / page_size; i++)
-		p[i * page_size / sizeof(*p)] = i + 0xdead0000;
-}
-
-/*
- * MADV_COLLAPSE is a best-effort request and may fail if an internal
- * resource is temporarily unavailable, in which case it will set errno to
- * EAGAIN.
In such a case, immediately reattempt the operation one more - * time. - */ -static int madvise_collapse_retry(void *p, unsigned long size) -{ - bool retry = true; - int ret; - -retry: - ret = madvise(p, size, MADV_COLLAPSE); - if (ret && errno == EAGAIN && retry) { - retry = false; - goto retry; - } - return ret; -} - -/* - * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with - * validate_memory()'able contents. - */ -static void *alloc_hpage(struct mem_ops *ops) -{ - void *p = ops->setup_area(1); - - ops->fault(p, 0, hpage_pmd_size); - - /* - * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. - * The latter is ineligible for collapse by MADV_COLLAPSE - * while the former might cause MADV_COLLAPSE to race with - * khugepaged on low-load system (like a test machine), which - * would cause MADV_COLLAPSE to fail with EAGAIN. - */ - printf("Allocate huge page..."); - if (madvise_collapse_retry(p, hpage_pmd_size)) { - perror("madvise(MADV_COLLAPSE)"); - exit(EXIT_FAILURE); - } - if (!ops->check_huge(p, 1)) { - perror("madvise(MADV_COLLAPSE)"); - exit(EXIT_FAILURE); - } - if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { - perror("madvise(MADV_HUGEPAGE)"); - exit(EXIT_FAILURE); - } - success("OK"); - return p; -} - -static void validate_memory(int *p, unsigned long start, unsigned long end) -{ - int i; - - for (i = start / page_size; i < end / page_size; i++) { - if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { - printf("Page %d is corrupted: %#x\n", - i, p[i * page_size / sizeof(*p)]); - exit(EXIT_FAILURE); - } - } -} - -static void *anon_setup_area(int nr_hpages) -{ - return alloc_mapping(nr_hpages); -} - -static void anon_cleanup_area(void *p, unsigned long size) -{ - munmap(p, size); -} - -static void anon_fault(void *p, unsigned long start, unsigned long end) -{ - fill_memory(p, start, end); -} - -static bool anon_check_huge(void *addr, int nr_hpages) -{ - return check_huge_anon(addr, nr_hpages, hpage_pmd_size); -} - -static void *file_setup_area(int nr_hpages) -{ - int fd; - void *p; - unsigned long size; - - unlink(finfo.path); /* Cleanup from previous failed tests */ - printf("Creating %s for collapse%s...", finfo.path, - finfo.type == VMA_SHMEM ? 
" (tmpfs)" : ""); - fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, - 777); - if (fd < 0) { - perror("open()"); - exit(EXIT_FAILURE); - } - - size = nr_hpages * hpage_pmd_size; - p = alloc_mapping(nr_hpages); - fill_memory(p, 0, size); - write(fd, p, size); - close(fd); - munmap(p, size); - success("OK"); - - printf("Opening %s read only for collapse...", finfo.path); - finfo.fd = open(finfo.path, O_RDONLY, 777); - if (finfo.fd < 0) { - perror("open()"); - exit(EXIT_FAILURE); - } - p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, - MAP_PRIVATE, finfo.fd, 0); - if (p == MAP_FAILED || p != BASE_ADDR) { - perror("mmap()"); - exit(EXIT_FAILURE); - } - - /* Drop page cache */ - write_file("/proc/sys/vm/drop_caches", "3", 2); - success("OK"); - return p; -} - -static void file_cleanup_area(void *p, unsigned long size) -{ - munmap(p, size); - close(finfo.fd); - unlink(finfo.path); -} - -static void file_fault(void *p, unsigned long start, unsigned long end) -{ - if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { - perror("madvise(MADV_POPULATE_READ"); - exit(EXIT_FAILURE); - } -} - -static bool file_check_huge(void *addr, int nr_hpages) -{ - switch (finfo.type) { - case VMA_FILE: - return check_huge_file(addr, nr_hpages, hpage_pmd_size); - case VMA_SHMEM: - return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); - default: - exit(EXIT_FAILURE); - return false; - } -} - -static void *shmem_setup_area(int nr_hpages) -{ - void *p; - unsigned long size = nr_hpages * hpage_pmd_size; - - finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); - if (finfo.fd < 0) { - perror("memfd_create()"); - exit(EXIT_FAILURE); - } - if (ftruncate(finfo.fd, size)) { - perror("ftruncate()"); - exit(EXIT_FAILURE); - } - p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, - 0); - if (p != BASE_ADDR) { - perror("mmap()"); - exit(EXIT_FAILURE); - } - return p; -} - -static void shmem_cleanup_area(void *p, unsigned long size) -{ - munmap(p, size); - close(finfo.fd); -} - -static bool shmem_check_huge(void *addr, int nr_hpages) -{ - return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); -} - -static struct mem_ops __anon_ops = { - .setup_area = &anon_setup_area, - .cleanup_area = &anon_cleanup_area, - .fault = &anon_fault, - .check_huge = &anon_check_huge, - .name = "anon", -}; - -static struct mem_ops __file_ops = { - .setup_area = &file_setup_area, - .cleanup_area = &file_cleanup_area, - .fault = &file_fault, - .check_huge = &file_check_huge, - .name = "file", -}; - -static struct mem_ops __shmem_ops = { - .setup_area = &shmem_setup_area, - .cleanup_area = &shmem_cleanup_area, - .fault = &anon_fault, - .check_huge = &shmem_check_huge, - .name = "shmem", -}; - -static void __madvise_collapse(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops, bool expect) -{ - int ret; - struct settings settings = *current_settings(); - - printf("%s...", msg); - - /* - * Prevent khugepaged interference and tests that MADV_COLLAPSE - * ignores /sys/kernel/mm/transparent_hugepage/enabled - */ - settings.thp_enabled = THP_NEVER; - settings.shmem_enabled = SHMEM_NEVER; - push_settings(&settings); - - /* Clear VM_NOHUGEPAGE */ - madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); - ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); - if (((bool)ret) == expect) - fail("Fail: Bad return value"); - else if (!ops->check_huge(p, expect ? 
nr_hpages : 0))
-		fail("Fail: check_huge()");
-	else
-		success("OK");
-
-	pop_settings();
-}
-
-static void madvise_collapse(const char *msg, char *p, int nr_hpages,
-			     struct mem_ops *ops, bool expect)
-{
-	/* Sanity check */
-	if (!ops->check_huge(p, 0)) {
-		printf("Unexpected huge page\n");
-		exit(EXIT_FAILURE);
-	}
-	__madvise_collapse(msg, p, nr_hpages, ops, expect);
-}
-
-#define TICK 500000
-static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
-			  struct mem_ops *ops)
-{
-	int full_scans;
-	int timeout = 6; /* 3 seconds */
-
-	/* Sanity check */
-	if (!ops->check_huge(p, 0)) {
-		printf("Unexpected huge page\n");
-		exit(EXIT_FAILURE);
-	}
-
-	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
-
-	/* Wait until the second full_scan completed */
-	full_scans = read_num("khugepaged/full_scans") + 2;
-
-	printf("%s...", msg);
-	while (timeout--) {
-		if (ops->check_huge(p, nr_hpages))
-			break;
-		if (read_num("khugepaged/full_scans") >= full_scans)
-			break;
-		printf(".");
-		usleep(TICK);
-	}
-
-	madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
-
-	return timeout == -1;
-}
-
-static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
-				struct mem_ops *ops, bool expect)
-{
-	if (wait_for_scan(msg, p, nr_hpages, ops)) {
-		if (expect)
-			fail("Timeout");
-		else
-			success("OK");
-		return;
-	}
-
-	/*
-	 * For file and shmem memory, khugepaged only retracts pte entries after
-	 * putting the new hugepage in the page cache. The hugepage must be
-	 * subsequently refaulted to install the pmd mapping for the mm.
-	 */
-	if (ops != &__anon_ops)
-		ops->fault(p, 0, nr_hpages * hpage_pmd_size);
-
-	if (ops->check_huge(p, expect ? nr_hpages : 0))
-		success("OK");
-	else
-		fail("Fail");
-}
-
-static struct collapse_context __khugepaged_context = {
-	.collapse = &khugepaged_collapse,
-	.enforce_pte_scan_limits = true,
-	.name = "khugepaged",
-};
-
-static struct collapse_context __madvise_context = {
-	.collapse = &madvise_collapse,
-	.enforce_pte_scan_limits = false,
-	.name = "madvise",
-};
-
-static bool is_tmpfs(struct mem_ops *ops)
-{
-	return ops == &__file_ops && finfo.type == VMA_SHMEM;
-}
-
-static void alloc_at_fault(void)
-{
-	struct settings settings = *current_settings();
-	char *p;
-
-	settings.thp_enabled = THP_ALWAYS;
-	push_settings(&settings);
-
-	p = alloc_mapping(1);
-	*p = 1;
-	printf("Allocate huge page on fault...");
-	if (check_huge_anon(p, 1, hpage_pmd_size))
-		success("OK");
-	else
-		fail("Fail");
-
-	pop_settings();
-
-	madvise(p, page_size, MADV_DONTNEED);
-	printf("Split huge PMD on MADV_DONTNEED...");
-	if (check_huge_anon(p, 0, hpage_pmd_size))
-		success("OK");
-	else
-		fail("Fail");
-	munmap(p, hpage_pmd_size);
-}
-
-static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-	int nr_hpages = 4;
-	unsigned long size = nr_hpages * hpage_pmd_size;
-
-	p = ops->setup_area(nr_hpages);
-	ops->fault(p, 0, size);
-	c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
-		    ops, true);
-	validate_memory(p, 0, size);
-	ops->cleanup_area(p, size);
-}
-
-static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-
-	p = ops->setup_area(1);
-	c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-
-	p = ops->setup_area(1);
-	ops->fault(p, 0, page_size);
-	c->collapse("Collapse PTE table with single PTE entry present", p,
-		    1, ops, true);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
-{
-	int max_ptes_none = hpage_pmd_nr / 2;
-	struct settings settings = *current_settings();
-	void *p;
-
-	settings.khugepaged.max_ptes_none = max_ptes_none;
-	push_settings(&settings);
-
-	p = ops->setup_area(1);
-
-	if (is_tmpfs(ops)) {
-		/* shmem pages always in the page cache */
-		printf("tmpfs...");
-		skip("Skip");
-		goto skip;
-	}
-
-	ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
-	c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
-		    ops, !c->enforce_pte_scan_limits);
-	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
-
-	if (c->enforce_pte_scan_limits) {
-		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
-		c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
-			    true);
-		validate_memory(p, 0,
-				(hpage_pmd_nr - max_ptes_none) * page_size);
-	}
-skip:
-	ops->cleanup_area(p, hpage_pmd_size);
-	pop_settings();
-}
-
-static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-
-	p = ops->setup_area(1);
-	ops->fault(p, 0, hpage_pmd_size);
-
-	printf("Swapout one page...");
-	if (madvise(p, page_size, MADV_PAGEOUT)) {
-		perror("madvise(MADV_PAGEOUT)");
-		exit(EXIT_FAILURE);
-	}
-	if (check_swap(p, page_size)) {
-		success("OK");
-	} else {
-		fail("Fail");
-		goto out;
-	}
-
-	c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
-		    true);
-	validate_memory(p, 0, hpage_pmd_size);
-out:
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
-{
-	int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
-	void *p;
-
-	p = ops->setup_area(1);
-	ops->fault(p, 0, hpage_pmd_size);
-
-	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
-	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
-		perror("madvise(MADV_PAGEOUT)");
-		exit(EXIT_FAILURE);
-	}
-	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
-		success("OK");
-	} else {
-		fail("Fail");
-		goto out;
-	}
-
-	c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
-		    !c->enforce_pte_scan_limits);
-	validate_memory(p, 0, hpage_pmd_size);
-
-	if (c->enforce_pte_scan_limits) {
-		ops->fault(p, 0, hpage_pmd_size);
-		printf("Swapout %d of %d pages...", max_ptes_swap,
-		       hpage_pmd_nr);
-		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
-			perror("madvise(MADV_PAGEOUT)");
-			exit(EXIT_FAILURE);
-		}
-		if (check_swap(p, max_ptes_swap * page_size)) {
-			success("OK");
-		} else {
-			fail("Fail");
-			goto out;
-		}
-
-		c->collapse("Collapse with max_ptes_swap pages swapped out", p,
-			    1, ops, true);
-		validate_memory(p, 0, hpage_pmd_size);
-	}
-out:
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-
-	p = alloc_hpage(ops);
-
-	if (is_tmpfs(ops)) {
-		/* MADV_DONTNEED won't evict tmpfs pages */
-		printf("tmpfs...");
-		skip("Skip");
-		goto skip;
-	}
-
-	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
-	printf("Split huge page leaving single PTE mapping compound page...");
-	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
-	if (ops->check_huge(p, 0))
-		success("OK");
-	else
-		fail("Fail");
-
-	c->collapse("Collapse PTE table with single PTE mapping compound page",
-		    p, 1, ops, true);
-	validate_memory(p, 0, page_size);
-skip:
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-
-	p = alloc_hpage(ops);
-	printf("Split huge page leaving single PTE page table full of compound pages...");
-	madvise(p, page_size, MADV_NOHUGEPAGE);
-	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
-	if (ops->check_huge(p, 0))
-		success("OK");
-	else
-		fail("Fail");
-
-	c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
-		    true);
-	validate_memory(p, 0, hpage_pmd_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-	int i;
-
-	p = ops->setup_area(1);
-	for (i = 0; i < hpage_pmd_nr; i++) {
-		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
-		       i + 1, hpage_pmd_nr);
-
-		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
-		ops->fault(BASE_ADDR, 0, hpage_pmd_size);
-		if (!ops->check_huge(BASE_ADDR, 1)) {
-			printf("Failed to allocate huge page\n");
-			exit(EXIT_FAILURE);
-		}
-		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
-
-		p = mremap(BASE_ADDR - i * page_size,
-			   i * page_size + hpage_pmd_size,
-			   (i + 1) * page_size,
-			   MREMAP_MAYMOVE | MREMAP_FIXED,
-			   BASE_ADDR + 2 * hpage_pmd_size);
-		if (p == MAP_FAILED) {
-			perror("mremap+unmap");
-			exit(EXIT_FAILURE);
-		}
-
-		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
-			   (i + 1) * page_size,
-			   (i + 1) * page_size + hpage_pmd_size,
-			   MREMAP_MAYMOVE | MREMAP_FIXED,
-			   BASE_ADDR - (i + 1) * page_size);
-		if (p == MAP_FAILED) {
-			perror("mremap+alloc");
-			exit(EXIT_FAILURE);
-		}
-	}
-
-	ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
-	ops->fault(p, 0, hpage_pmd_size);
-	if (!ops->check_huge(p, 1))
-		success("OK");
-	else
-		fail("Fail");
-
-	c->collapse("Collapse PTE table full of different compound pages", p, 1,
-		    ops, true);
-
-	validate_memory(p, 0, hpage_pmd_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
-{
-	int wstatus;
-	void *p;
-
-	p = ops->setup_area(1);
-
-	printf("Allocate small page...");
-	ops->fault(p, 0, page_size);
-	if (ops->check_huge(p, 0))
-		success("OK");
-	else
-		fail("Fail");
-
-	printf("Share small page over fork()...");
-	if (!fork()) {
-		/* Do not touch settings on child exit */
-		skip_settings_restore = true;
-		exit_status = 0;
-
-		if (ops->check_huge(p, 0))
-			success("OK");
-		else
-			fail("Fail");
-
-		ops->fault(p, page_size, 2 * page_size);
-		c->collapse("Collapse PTE table with single page shared with parent process",
-			    p, 1, ops, true);
-
-		validate_memory(p, 0, page_size);
-		ops->cleanup_area(p, hpage_pmd_size);
-		exit(exit_status);
-	}
-
-	wait(&wstatus);
-	exit_status += WEXITSTATUS(wstatus);
-
-	printf("Check if parent still has small page...");
-	if (ops->check_huge(p, 0))
-		success("OK");
-	else
-		fail("Fail");
-	validate_memory(p, 0, page_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
-{
-	int wstatus;
-	void *p;
-
-	p = alloc_hpage(ops);
-	printf("Share huge page over fork()...");
-	if (!fork()) {
-		/* Do not touch settings on child exit */
-		skip_settings_restore = true;
-		exit_status = 0;
-
-		if (ops->check_huge(p, 1))
-			success("OK");
-		else
-			fail("Fail");
-
-		printf("Split huge page PMD in child process...");
-		madvise(p, page_size, MADV_NOHUGEPAGE);
-		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
-		if (ops->check_huge(p, 0))
-			success("OK");
-		else
-			fail("Fail");
-		ops->fault(p, 0, page_size);
-
-		write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
-		c->collapse("Collapse PTE table full of compound pages in child",
-			    p, 1, ops, true);
-		write_num("khugepaged/max_ptes_shared",
-			  current_settings()->khugepaged.max_ptes_shared);
-
-		validate_memory(p, 0, hpage_pmd_size);
-		ops->cleanup_area(p, hpage_pmd_size);
-		exit(exit_status);
-	}
-
-	wait(&wstatus);
-	exit_status += WEXITSTATUS(wstatus);
-
-	printf("Check if parent still has huge page...");
-	if (ops->check_huge(p, 1))
-		success("OK");
-	else
-		fail("Fail");
-	validate_memory(p, 0, hpage_pmd_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
-{
-	int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
-	int wstatus;
-	void *p;
-
-	p = alloc_hpage(ops);
-	printf("Share huge page over fork()...");
-	if (!fork()) {
-		/* Do not touch settings on child exit */
-		skip_settings_restore = true;
-		exit_status = 0;
-
-		if (ops->check_huge(p, 1))
-			success("OK");
-		else
-			fail("Fail");
-
-		printf("Trigger CoW on page %d of %d...",
-		       hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
-		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
-		if (ops->check_huge(p, 0))
-			success("OK");
-		else
-			fail("Fail");
-
-		c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
-			    1, ops, !c->enforce_pte_scan_limits);
-
-		if (c->enforce_pte_scan_limits) {
-			printf("Trigger CoW on page %d of %d...",
-			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
-			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
-				   page_size);
-			if (ops->check_huge(p, 0))
-				success("OK");
-			else
-				fail("Fail");
-
-			c->collapse("Collapse with max_ptes_shared PTEs shared",
-				    p, 1, ops, true);
-		}
-
-		validate_memory(p, 0, hpage_pmd_size);
-		ops->cleanup_area(p, hpage_pmd_size);
-		exit(exit_status);
-	}
-
-	wait(&wstatus);
-	exit_status += WEXITSTATUS(wstatus);
-
-	printf("Check if parent still has huge page...");
-	if (ops->check_huge(p, 1))
-		success("OK");
-	else
-		fail("Fail");
-	validate_memory(p, 0, hpage_pmd_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void madvise_collapse_existing_thps(struct collapse_context *c,
-					   struct mem_ops *ops)
-{
-	void *p;
-
-	p = ops->setup_area(1);
-	ops->fault(p, 0, hpage_pmd_size);
-	c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
-	validate_memory(p, 0, hpage_pmd_size);
-
-	/* c->collapse() will find a hugepage and complain - call directly. */
-	__madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
-	validate_memory(p, 0, hpage_pmd_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-/*
- * Test race with khugepaged where page tables have been retracted and
- * pmd cleared.
- */
-static void madvise_retracted_page_tables(struct collapse_context *c,
-					  struct mem_ops *ops)
-{
-	void *p;
-	int nr_hpages = 1;
-	unsigned long size = nr_hpages * hpage_pmd_size;
-
-	p = ops->setup_area(nr_hpages);
-	ops->fault(p, 0, size);
-
-	/* Let khugepaged collapse and leave pmd cleared */
-	if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
-			  ops)) {
-		fail("Timeout");
-		return;
-	}
-	success("OK");
-	c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
-		    true);
-	validate_memory(p, 0, size);
-	ops->cleanup_area(p, size);
-}
-
-static void usage(void)
-{
-	fprintf(stderr, "\nUsage: ./khugepaged <test type> [dir]\n\n");
-	fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n");
-	fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n");
-	fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n");
-	fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n");
-	fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n");
-	fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n");
-	fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n");
-	fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n");
-	exit(1);
-}
-
-static void parse_test_type(int argc, const char **argv)
-{
-	char *buf;
-	const char *token;
-
-	if (argc == 1) {
-		/* Backwards compatibility */
-		khugepaged_context = &__khugepaged_context;
-		madvise_context = &__madvise_context;
-		anon_ops = &__anon_ops;
-		return;
-	}
-
-	buf = strdup(argv[1]);
-	token = strsep(&buf, ":");
-
-	if (!strcmp(token, "all")) {
-		khugepaged_context = &__khugepaged_context;
-		madvise_context = &__madvise_context;
-	} else if (!strcmp(token, "khugepaged")) {
-		khugepaged_context = &__khugepaged_context;
-	} else if (!strcmp(token, "madvise")) {
-		madvise_context = &__madvise_context;
-	} else {
-		usage();
-	}
-
-	if (!buf)
-		usage();
-
-	if (!strcmp(buf, "all")) {
-		file_ops = &__file_ops;
-		anon_ops = &__anon_ops;
-		shmem_ops = &__shmem_ops;
-	} else if (!strcmp(buf, "anon")) {
-		anon_ops = &__anon_ops;
-	} else if (!strcmp(buf, "file")) {
-		file_ops = &__file_ops;
-	} else if (!strcmp(buf, "shmem")) {
-		shmem_ops = &__shmem_ops;
-	} else {
-		usage();
-	}
-
-	if (!file_ops)
-		return;
-
-	if (argc != 3)
-		usage();
-}
-
-int main(int argc, const char **argv)
-{
-	struct settings default_settings = {
-		.thp_enabled = THP_MADVISE,
-		.thp_defrag = THP_DEFRAG_ALWAYS,
-		.shmem_enabled = SHMEM_ADVISE,
-		.use_zero_page = 0,
-		.khugepaged = {
-			.defrag = 1,
-			.alloc_sleep_millisecs = 10,
-			.scan_sleep_millisecs = 10,
-		},
-		/*
-		 * When testing file-backed memory, the collapse path
-		 * looks at how many pages are found in the page cache, not
-		 * what pages are mapped. Disable read ahead optimization so
-		 * pages don't find their way into the page cache unless
-		 * we mem_ops->fault() them in.
-		 */
-		.read_ahead_kb = 0,
-	};
-
-	parse_test_type(argc, argv);
-
-	if (file_ops)
-		get_finfo(argv[2]);
-
-	setbuf(stdout, NULL);
-
-	page_size = getpagesize();
-	hpage_pmd_size = read_pmd_pagesize();
-	hpage_pmd_nr = hpage_pmd_size / page_size;
-
-	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
-	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
-	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
-	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
-
-	save_settings();
-	push_settings(&default_settings);
-
-	alloc_at_fault();
-
-#define TEST(t, c, o) do { \
-	if (c && o) { \
-		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
-		t(c, o); \
-	} \
-	} while (0)
-
-	TEST(collapse_full, khugepaged_context, anon_ops);
-	TEST(collapse_full, khugepaged_context, file_ops);
-	TEST(collapse_full, khugepaged_context, shmem_ops);
-	TEST(collapse_full, madvise_context, anon_ops);
-	TEST(collapse_full, madvise_context, file_ops);
-	TEST(collapse_full, madvise_context, shmem_ops);
-
-	TEST(collapse_empty, khugepaged_context, anon_ops);
-	TEST(collapse_empty, madvise_context, anon_ops);
-
-	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
-	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
-	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
-	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
-	TEST(collapse_single_pte_entry, madvise_context, file_ops);
-	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
-
-	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
-	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
-	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
-	TEST(collapse_max_ptes_none, madvise_context, file_ops);
-
-	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
-	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
-	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
-	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
-
-	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
-	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
-	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
-	TEST(collapse_full_of_compound, madvise_context, anon_ops);
-	TEST(collapse_full_of_compound, madvise_context, file_ops);
-	TEST(collapse_full_of_compound, madvise_context, shmem_ops);
-
-	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
-	TEST(collapse_compound_extreme, madvise_context, anon_ops);
-
-	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
-	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
-
-	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
-	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
-
-	TEST(collapse_fork, khugepaged_context, anon_ops);
-	TEST(collapse_fork, madvise_context, anon_ops);
-
-	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
-	TEST(collapse_fork_compound, madvise_context, anon_ops);
-
-	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
-	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
-
-	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
-	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
-	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
-
-	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
-	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
-
-	restore_settings(0);
-}
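[Editorial note, not part of the patch: the "wait for two full scans" idiom that wait_for_scan() above relies on can be sketched standalone as below. The sysfs path and the 500 ms tick mirror what the deleted test reads via read_num() and TICK; both are assumptions about the running kernel's tunables.]

#include <stdio.h>
#include <unistd.h>

static long read_full_scans(void)
{
	long val = -1;
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans", "r");

	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

/* Block until khugepaged has completed two further full scans. */
static int wait_two_full_scans(void)
{
	long target = read_full_scans() + 2;

	if (target < 2)
		return -1;	/* sysfs file missing or unreadable */
	while (read_full_scans() < target)
		usleep(500000);	/* same 500 ms tick as TICK above */
	return 0;
}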
diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c
deleted file mode 100644
index d8b5b4930412..000000000000
--- a/tools/testing/selftests/vm/ksm_functional_tests.c
+++ /dev/null
@@ -1,279 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * KSM functional tests
- *
- * Copyright 2022, Red Hat, Inc.
- *
- * Author(s): David Hildenbrand <david@redhat.com>
- */
-#define _GNU_SOURCE
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/syscall.h>
-#include <linux/userfaultfd.h>
-
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#define KiB 1024u
-#define MiB (1024 * KiB)
-
-static int ksm_fd;
-static int ksm_full_scans_fd;
-static int pagemap_fd;
-static size_t pagesize;
-
-static bool range_maps_duplicates(char *addr, unsigned long size)
-{
-	unsigned long offs_a, offs_b, pfn_a, pfn_b;
-
-	/*
-	 * There is no easy way to check if there are KSM pages mapped into
-	 * this range. We only check that the range does not map the same PFN
-	 * twice by comparing each pair of mapped pages.
-	 */
-	for (offs_a = 0; offs_a < size; offs_a += pagesize) {
-		pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a);
-		/* Page not present or PFN not exposed by the kernel. */
-		if (pfn_a == -1ul || !pfn_a)
-			continue;
-
-		for (offs_b = offs_a + pagesize; offs_b < size;
-		     offs_b += pagesize) {
-			pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b);
-			if (pfn_b == -1ul || !pfn_b)
-				continue;
-			if (pfn_a == pfn_b)
-				return true;
-		}
-	}
-	return false;
-}
-
-static long ksm_get_full_scans(void)
-{
-	char buf[10];
-	ssize_t ret;
-
-	ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0);
-	if (ret <= 0)
-		return -errno;
-	buf[ret] = 0;
-
-	return strtol(buf, NULL, 10);
-}
-
-static int ksm_merge(void)
-{
-	long start_scans, end_scans;
-
-	/* Wait for two full scans such that any possible merging happened. */
-	start_scans = ksm_get_full_scans();
-	if (start_scans < 0)
-		return start_scans;
-	if (write(ksm_fd, "1", 1) != 1)
-		return -errno;
-	do {
-		end_scans = ksm_get_full_scans();
-		if (end_scans < 0)
-			return end_scans;
-	} while (end_scans < start_scans + 2);
-
-	return 0;
-}
-
-static char *mmap_and_merge_range(char val, unsigned long size)
-{
-	char *map;
-
-	map = mmap(NULL, size, PROT_READ|PROT_WRITE,
-		   MAP_PRIVATE|MAP_ANON, -1, 0);
-	if (map == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		return MAP_FAILED;
-	}
-
-	/* Don't use THP. Ignore if THP are not around on a kernel. */
-	if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) {
-		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
-		goto unmap;
-	}
-
-	/* Make sure each page contains the same values to merge them. */
-	memset(map, val, size);
-	if (madvise(map, size, MADV_MERGEABLE)) {
-		ksft_test_result_fail("MADV_MERGEABLE failed\n");
-		goto unmap;
-	}
-
-	/* Run KSM to trigger merging and wait. */
-	if (ksm_merge()) {
-		ksft_test_result_fail("Running KSM failed\n");
-		goto unmap;
-	}
-	return map;
-unmap:
-	munmap(map, size);
-	return MAP_FAILED;
-}
-
-static void test_unmerge(void)
-{
-	const unsigned int size = 2 * MiB;
-	char *map;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	map = mmap_and_merge_range(0xcf, size);
-	if (map == MAP_FAILED)
-		return;
-
-	if (madvise(map, size, MADV_UNMERGEABLE)) {
-		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
-		goto unmap;
-	}
-
-	ksft_test_result(!range_maps_duplicates(map, size),
-			 "Pages were unmerged\n");
-unmap:
-	munmap(map, size);
-}
-
-static void test_unmerge_discarded(void)
-{
-	const unsigned int size = 2 * MiB;
-	char *map;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	map = mmap_and_merge_range(0xcf, size);
-	if (map == MAP_FAILED)
-		return;
-
-	/* Discard half of all mapped pages so we have pte_none() entries. */
-	if (madvise(map, size / 2, MADV_DONTNEED)) {
-		ksft_test_result_fail("MADV_DONTNEED failed\n");
-		goto unmap;
-	}
-
-	if (madvise(map, size, MADV_UNMERGEABLE)) {
-		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
-		goto unmap;
-	}
-
-	ksft_test_result(!range_maps_duplicates(map, size),
-			 "Pages were unmerged\n");
-unmap:
-	munmap(map, size);
-}
-
-#ifdef __NR_userfaultfd
-static void test_unmerge_uffd_wp(void)
-{
-	struct uffdio_writeprotect uffd_writeprotect;
-	struct uffdio_register uffdio_register;
-	const unsigned int size = 2 * MiB;
-	struct uffdio_api uffdio_api;
-	char *map;
-	int uffd;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	map = mmap_and_merge_range(0xcf, size);
-	if (map == MAP_FAILED)
-		return;
-
-	/* See if UFFD is around. */
-	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
-	if (uffd < 0) {
-		ksft_test_result_skip("__NR_userfaultfd failed\n");
-		goto unmap;
-	}
-
-	/* See if UFFD-WP is around. */
-	uffdio_api.api = UFFD_API;
-	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
-	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
-		ksft_test_result_fail("UFFDIO_API failed\n");
-		goto close_uffd;
-	}
-	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
-		ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n");
-		goto close_uffd;
-	}
-
-	/* Register UFFD-WP, no need for an actual handler. */
-	uffdio_register.range.start = (unsigned long) map;
-	uffdio_register.range.len = size;
-	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
-	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
-		ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n");
-		goto close_uffd;
-	}
-
-	/* Write-protect the range using UFFD-WP. */
-	uffd_writeprotect.range.start = (unsigned long) map;
-	uffd_writeprotect.range.len = size;
-	uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
-	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
-		ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n");
-		goto close_uffd;
-	}
-
-	if (madvise(map, size, MADV_UNMERGEABLE)) {
-		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
-		goto close_uffd;
-	}
-
-	ksft_test_result(!range_maps_duplicates(map, size),
-			 "Pages were unmerged\n");
-close_uffd:
-	close(uffd);
-unmap:
-	munmap(map, size);
-}
-#endif
-
-int main(int argc, char **argv)
-{
-	unsigned int tests = 2;
-	int err;
-
-#ifdef __NR_userfaultfd
-	tests++;
-#endif
-
-	ksft_print_header();
-	ksft_set_plan(tests);
-
-	pagesize = getpagesize();
-
-	ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
-	if (ksm_fd < 0)
-		ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n");
-	ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY);
-	if (ksm_full_scans_fd < 0)
-		ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n");
-	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-	if (pagemap_fd < 0)
-		ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");
-
-	test_unmerge();
-	test_unmerge_discarded();
-#ifdef __NR_userfaultfd
-	test_unmerge_uffd_wp();
-#endif
-
-	err = ksft_get_fail_cnt();
-	if (err)
-		ksft_exit_fail_msg("%d out of %d tests failed\n",
-				   err, ksft_test_num());
-	return ksft_exit_pass();
-}
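[Editorial note, not part of the patch: the mmap_and_merge_range() setup above boils down to the pattern below — identical page contents plus MADV_MERGEABLE make a range a KSM candidate. A minimal sketch, assuming KSM has been started via /sys/kernel/mm/ksm/run as in the test's main().]

#include <string.h>
#include <sys/mman.h>

/* Map two anonymous pages with identical contents and mark them mergeable. */
static char *map_two_identical_pages(size_t pagesize)
{
	char *map = mmap(NULL, 2 * pagesize, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (map == MAP_FAILED)
		return NULL;
	memset(map, 0xcf, 2 * pagesize);	/* identical content -> mergeable */
	if (madvise(map, 2 * pagesize, MADV_MERGEABLE)) {
		munmap(map, 2 * pagesize);
		return NULL;
	}
	return map;	/* ksmd may now merge the two pages on its next scans */
}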
diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c
deleted file mode 100644
index f9eb4d67e0dd..000000000000
--- a/tools/testing/selftests/vm/ksm_tests.c
+++ /dev/null
@@ -1,849 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <sys/mman.h>
-#include <stdbool.h>
-#include <time.h>
-#include <string.h>
-#include <numa.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <stdint.h>
-#include <err.h>
-
-#include "../kselftest.h"
-#include <include/vdso/time64.h>
-#include "util.h"
-
-#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
-#define KSM_FP(s) (KSM_SYSFS_PATH s)
-#define KSM_SCAN_LIMIT_SEC_DEFAULT 120
-#define KSM_PAGE_COUNT_DEFAULT 10l
-#define KSM_PROT_STR_DEFAULT "rw"
-#define KSM_USE_ZERO_PAGES_DEFAULT false
-#define KSM_MERGE_ACROSS_NODES_DEFAULT true
-#define MB (1ul << 20)
-
-struct ksm_sysfs {
-	unsigned long max_page_sharing;
-	unsigned long merge_across_nodes;
-	unsigned long pages_to_scan;
-	unsigned long run;
-	unsigned long sleep_millisecs;
-	unsigned long stable_node_chains_prune_millisecs;
-	unsigned long use_zero_pages;
-};
-
-enum ksm_test_name {
-	CHECK_KSM_MERGE,
-	CHECK_KSM_UNMERGE,
-	CHECK_KSM_ZERO_PAGE_MERGE,
-	CHECK_KSM_NUMA_MERGE,
-	KSM_MERGE_TIME,
-	KSM_MERGE_TIME_HUGE_PAGES,
-	KSM_UNMERGE_TIME,
-	KSM_COW_TIME
-};
-
-static int ksm_write_sysfs(const char *file_path, unsigned long val)
-{
-	FILE *f = fopen(file_path, "w");
-
-	if (!f) {
-		fprintf(stderr, "f %s\n", file_path);
-		perror("fopen");
-		return 1;
-	}
-	if (fprintf(f, "%lu", val) < 0) {
-		perror("fprintf");
-		fclose(f);
-		return 1;
-	}
-	fclose(f);
-
-	return 0;
-}
-
-static int ksm_read_sysfs(const char *file_path, unsigned long *val)
-{
-	FILE *f = fopen(file_path, "r");
-
-	if (!f) {
-		fprintf(stderr, "f %s\n", file_path);
-		perror("fopen");
-		return 1;
-	}
-	if (fscanf(f, "%lu", val) != 1) {
-		perror("fscanf");
-		fclose(f);
-		return 1;
-	}
-	fclose(f);
-
-	return 0;
-}
-
-static int str_to_prot(char *prot_str)
-{
-	int prot = 0;
-
-	if ((strchr(prot_str, 'r')) != NULL)
-		prot |= PROT_READ;
-	if ((strchr(prot_str, 'w')) != NULL)
-		prot |= PROT_WRITE;
-	if ((strchr(prot_str, 'x')) != NULL)
-		prot |= PROT_EXEC;
-
-	return prot;
-}
-
-static void print_help(void)
-{
-	printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n"
-	       "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n");
-
-	printf("Supported <test type>:\n"
-	       " -M (page merging)\n"
-	       " -Z (zero pages merging)\n"
-	       " -N (merging of pages in different NUMA nodes)\n"
-	       " -U (page unmerging)\n"
-	       " -P evaluate merging time and speed.\n"
-	       "    For this test, the size of duplicated memory area (in MiB)\n"
-	       "    must be provided using -s option\n"
-	       " -H evaluate merging time and speed of area allocated mostly with huge pages\n"
-	       "    For this test, the size of duplicated memory area (in MiB)\n"
-	       "    must be provided using -s option\n"
-	       " -D evaluate unmerging time and speed when disabling KSM.\n"
-	       "    For this test, the size of duplicated memory area (in MiB)\n"
-	       "    must be provided using -s option\n"
-	       " -C evaluate the time required to break COW of merged pages.\n\n");
-
-	printf(" -a: specify the access protections of pages.\n"
-	       "     <prot> must be of the form [rwx].\n"
-	       "     Default: %s\n", KSM_PROT_STR_DEFAULT);
-	printf(" -p: specify the number of pages to test.\n"
-	       "     Default: %ld\n", KSM_PAGE_COUNT_DEFAULT);
-	printf(" -l: limit the maximum running time (in seconds) for a test.\n"
-	       "     Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT);
-	printf(" -z: change use_zero_pages tunable\n"
-	       "     Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT);
-	printf(" -m: change merge_across_nodes tunable\n"
-	       "     Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT);
-	printf(" -s: the size of duplicated memory area (in MiB)\n");
-
-	exit(0);
-}
-
-static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size)
-{
-	void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0);
-
-	if (!map_ptr) {
-		perror("mmap");
-		return NULL;
-	}
-	memset(map_ptr, data, map_size);
-	if (mprotect(map_ptr, map_size, prot)) {
-		perror("mprotect");
-		munmap(map_ptr, map_size);
-		return NULL;
-	}
-
-	return map_ptr;
-}
-
-static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout)
-{
-	struct timespec cur_time;
-	unsigned long cur_scan, init_scan;
-
-	if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan))
-		return 1;
-	cur_scan = init_scan;
-
-	while (cur_scan < init_scan + scan_count) {
-		if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan))
-			return 1;
-		if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) {
-			perror("clock_gettime");
-			return 1;
-		}
-		if ((cur_time.tv_sec - start_time.tv_sec) > timeout) {
-			printf("Scan time limit exceeded\n");
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout)
-{
-	if (madvise(addr, size, MADV_MERGEABLE)) {
-		perror("madvise");
-		return 1;
-	}
-	if (ksm_write_sysfs(KSM_FP("run"), 1))
-		return 1;
-
-	/* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */
-	if (ksm_do_scan(2, start_time, timeout))
-		return 1;
-
-	return 0;
-}
-
-static int ksm_unmerge_pages(void *addr, size_t size,
-			     struct timespec start_time, int timeout)
-{
-	if (madvise(addr, size, MADV_UNMERGEABLE)) {
-		perror("madvise");
-		return 1;
-	}
-	return 0;
-}
-
-static bool assert_ksm_pages_count(long dupl_page_count)
-{
-	unsigned long max_page_sharing, pages_sharing, pages_shared;
-
-	if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) ||
-	    ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) ||
-	    ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing))
-		return false;
-
-	/*
-	 * Since there must be at least 2 pages for merging and 1 page can be
-	 * shared with the limited number of pages (max_page_sharing), sometimes
-	 * there are 'leftover' pages that cannot be merged. For example, if there
-	 * are 11 pages and max_page_sharing = 10, then only 10 pages will be
-	 * merged and the 11th page won't be affected. As a result, when the number
-	 * of duplicate pages is divided by max_page_sharing and the remainder is 1,
-	 * pages_shared and pages_sharing values will be equal between dupl_page_count
-	 * and dupl_page_count - 1.
-	 */
-	if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) {
-		if (pages_shared == dupl_page_count / max_page_sharing &&
-		    pages_sharing == pages_shared * (max_page_sharing - 1))
-			return true;
-	} else {
-		if (pages_shared == (dupl_page_count / max_page_sharing + 1) &&
-		    pages_sharing == dupl_page_count - pages_shared)
-			return true;
-	}
-
-	return false;
-}
-
-static int ksm_save_def(struct ksm_sysfs *ksm_sysfs)
-{
-	if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) ||
-	    numa_available() ? 0 :
-	    ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) ||
-	    ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) ||
-	    ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) ||
-	    ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) ||
-	    ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
-			   &ksm_sysfs->stable_node_chains_prune_millisecs) ||
-	    ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages))
-		return 1;
-
-	return 0;
-}
-
-static int ksm_restore(struct ksm_sysfs *ksm_sysfs)
-{
-	if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) ||
-	    numa_available() ? 0 :
-	    ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) ||
-	    ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) ||
-	    ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) ||
-	    ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) ||
-	    ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
-			    ksm_sysfs->stable_node_chains_prune_millisecs) ||
-	    ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages))
-		return 1;
-
-	return 0;
-}
-
-static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size)
-{
-	void *map_ptr;
-	struct timespec start_time;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-
-	/* fill pages with the same data and merge them */
-	map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
-	if (!map_ptr)
-		return KSFT_FAIL;
-
-	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-		goto err_out;
-
-	/* verify that the right number of pages are merged */
-	if (assert_ksm_pages_count(page_count)) {
-		printf("OK\n");
-		munmap(map_ptr, page_size * page_count);
-		return KSFT_PASS;
-	}
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_FAIL;
-}
-
-static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size)
-{
-	void *map_ptr;
-	struct timespec start_time;
-	int page_count = 2;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-
-	/* fill pages with the same data and merge them */
-	map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
-	if (!map_ptr)
-		return KSFT_FAIL;
-
-	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-		goto err_out;
-
-	/* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */
-	memset(map_ptr, '-', 1);
-	memset(map_ptr + page_size, '+', 1);
-
-	/* get at least 1 scan, so KSM can detect that the pages were modified */
-	if (ksm_do_scan(1, start_time, timeout))
-		goto err_out;
-
-	/* check that unmerging was successful and 0 pages are currently merged */
-	if (assert_ksm_pages_count(0)) {
-		printf("OK\n");
-		munmap(map_ptr, page_size * page_count);
-		return KSFT_PASS;
-	}
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_FAIL;
-}
-
-static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout,
-				     bool use_zero_pages, size_t page_size)
-{
-	void *map_ptr;
-	struct timespec start_time;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-
-	if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages))
-		return KSFT_FAIL;
-
-	/* fill pages with zero and try to merge them */
-	map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count);
-	if (!map_ptr)
-		return KSFT_FAIL;
-
-	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-		goto err_out;
-
-	/*
-	 * verify that the right number of pages are merged:
-	 * 1) if use_zero_pages is set to 1, empty pages are merged
-	 *    with the kernel zero page instead of with each other;
-	 * 2) if use_zero_pages is set to 0, empty pages are not treated specially
-	 *    and merged as usual.
-	 */
-	if (use_zero_pages && !assert_ksm_pages_count(0))
-		goto err_out;
-	else if (!use_zero_pages && !assert_ksm_pages_count(page_count))
-		goto err_out;
-
-	printf("OK\n");
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_PASS;
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_FAIL;
-}
-
-static int get_next_mem_node(int node)
-{
-
-	long node_size;
-	int mem_node = 0;
-	int i, max_node = numa_max_node();
-
-	for (i = node + 1; i <= max_node + node; i++) {
-		mem_node = i % (max_node + 1);
-		node_size = numa_node_size(mem_node, NULL);
-		if (node_size > 0)
-			break;
-	}
-	return mem_node;
-}
-
-static int get_first_mem_node(void)
-{
-	return get_next_mem_node(numa_max_node());
-}
-
-static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes,
-				size_t page_size)
-{
-	void *numa1_map_ptr, *numa2_map_ptr;
-	struct timespec start_time;
-	int page_count = 2;
-	int first_node;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-
-	if (numa_available() < 0) {
-		perror("NUMA support not enabled");
-		return KSFT_SKIP;
-	}
-	if (numa_num_configured_nodes() <= 1) {
-		printf("At least 2 NUMA nodes must be available\n");
-		return KSFT_SKIP;
-	}
-	if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes))
-		return KSFT_FAIL;
-
-	/* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */
-	first_node = get_first_mem_node();
-	numa1_map_ptr = numa_alloc_onnode(page_size, first_node);
-	numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node));
-	if (!numa1_map_ptr || !numa2_map_ptr) {
-		perror("numa_alloc_onnode");
-		return KSFT_FAIL;
-	}
-
-	memset(numa1_map_ptr, '*', page_size);
-	memset(numa2_map_ptr, '*', page_size);
-
-	/* try to merge the pages */
-	if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) ||
-	    ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout))
-		goto err_out;
-
-	/*
-	 * verify that the right number of pages are merged:
-	 * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged;
-	 * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is
-	 *    only 1 unique page in each node and they can't be shared.
-	 */
-	if (merge_across_nodes && !assert_ksm_pages_count(page_count))
-		goto err_out;
-	else if (!merge_across_nodes && !assert_ksm_pages_count(0))
-		goto err_out;
-
-	numa_free(numa1_map_ptr, page_size);
-	numa_free(numa2_map_ptr, page_size);
-	printf("OK\n");
-	return KSFT_PASS;
-
-err_out:
-	numa_free(numa1_map_ptr, page_size);
-	numa_free(numa2_map_ptr, page_size);
-	printf("Not OK\n");
-	return KSFT_FAIL;
-}
-
-static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size)
-{
-	void *map_ptr, *map_ptr_orig;
-	struct timespec start_time, end_time;
-	unsigned long scan_time_ns;
-	int pagemap_fd, n_normal_pages, n_huge_pages;
-
-	map_size *= MB;
-	size_t len = map_size;
-
-	len -= len % HPAGE_SIZE;
-	map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
-			    MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
-	map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE;
-
-	if (map_ptr_orig == MAP_FAILED)
-		err(2, "initial mmap");
-
-	if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE))
-		err(2, "MADV_HUGEPAGE");
-
-	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-	if (pagemap_fd < 0)
-		err(2, "open pagemap");
-
-	n_normal_pages = 0;
-	n_huge_pages = 0;
-	for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) {
-		if (allocate_transhuge(p, pagemap_fd) < 0)
-			n_normal_pages++;
-		else
-			n_huge_pages++;
-	}
-	printf("Number of normal pages: %d\n", n_normal_pages);
-	printf("Number of huge pages: %d\n", n_huge_pages);
-
-	memset(map_ptr, '*', len);
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-	if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
-		goto err_out;
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-
-	scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-		       (end_time.tv_nsec - start_time.tv_nsec);
-
-	printf("Total size: %lu MiB\n", map_size / MB);
-	printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
-	       scan_time_ns % NSEC_PER_SEC);
-	printf("Average speed: %.3f MiB/s\n", (map_size / MB) /
-	       ((double)scan_time_ns / NSEC_PER_SEC));
-
-	munmap(map_ptr_orig, len + HPAGE_SIZE);
-	return KSFT_PASS;
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr_orig, len + HPAGE_SIZE);
-	return KSFT_FAIL;
-}
-
-static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size)
-{
-	void *map_ptr;
-	struct timespec start_time, end_time;
-	unsigned long scan_time_ns;
-
-	map_size *= MB;
-
-	map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
-	if (!map_ptr)
-		return KSFT_FAIL;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-	if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
-		goto err_out;
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-
-	scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-		       (end_time.tv_nsec - start_time.tv_nsec);
-
-	printf("Total size: %lu MiB\n", map_size / MB);
-	printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
-	       scan_time_ns % NSEC_PER_SEC);
-	printf("Average speed: %.3f MiB/s\n", (map_size / MB) /
-	       ((double)scan_time_ns / NSEC_PER_SEC));
-
-	munmap(map_ptr, map_size);
-	return KSFT_PASS;
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, map_size);
-	return KSFT_FAIL;
-}
-
-static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size)
-{
-	void *map_ptr;
-	struct timespec start_time, end_time;
-	unsigned long scan_time_ns;
-
-	map_size *= MB;
-
-	map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
-	if (!map_ptr)
-		return KSFT_FAIL;
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-	if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
-		goto err_out;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-	if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout))
-		goto err_out;
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-
-	scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-		       (end_time.tv_nsec - start_time.tv_nsec);
-
-	printf("Total size: %lu MiB\n", map_size / MB);
-	printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
-	       scan_time_ns % NSEC_PER_SEC);
-	printf("Average speed: %.3f MiB/s\n", (map_size / MB) /
-	       ((double)scan_time_ns / NSEC_PER_SEC));
-
-	munmap(map_ptr, map_size);
-	return KSFT_PASS;
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, map_size);
-	return KSFT_FAIL;
-}
-
-static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size)
-{
-	void *map_ptr;
-	struct timespec start_time, end_time;
-	unsigned long cow_time_ns;
-
-	/* page_count must be less than 2*page_size */
-	size_t page_count = 4000;
-
-	map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
-	if (!map_ptr)
-		return KSFT_FAIL;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-	for (size_t i = 0; i < page_count - 1; i = i + 2)
-		memset(map_ptr + page_size * i, '-', 1);
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-
-	cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-		      (end_time.tv_nsec - start_time.tv_nsec);
-
-	printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB);
-	printf("Not merged pages:\n");
-	printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
-	       cow_time_ns % NSEC_PER_SEC);
-	printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) /
-	       ((double)cow_time_ns / NSEC_PER_SEC));
-
-	/* Create 2000 pairs of duplicate pages */
-	for (size_t i = 0; i < page_count - 1; i = i + 2) {
-		memset(map_ptr + page_size * i, '+', i / 2 + 1);
-		memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1);
-	}
-	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-		goto err_out;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-	for (size_t i = 0; i < page_count - 1; i = i + 2)
-		memset(map_ptr + page_size * i, '-', 1);
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-
-	cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-		      (end_time.tv_nsec - start_time.tv_nsec);
-
-	printf("Merged pages:\n");
-	printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
-	       cow_time_ns % NSEC_PER_SEC);
-	printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) /
-	       ((double)cow_time_ns / NSEC_PER_SEC));
-
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_PASS;
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_FAIL;
-}
-
-int main(int argc, char *argv[])
-{
-	int ret, opt;
-	int prot = 0;
-	int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT;
-	long page_count = KSM_PAGE_COUNT_DEFAULT;
-	size_t page_size = sysconf(_SC_PAGESIZE);
-	struct ksm_sysfs ksm_sysfs_old;
-	int test_name = CHECK_KSM_MERGE;
-	bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT;
-	bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
-	long size_MB = 0;
-
-	while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) {
-		switch (opt) {
-		case 'a':
-			prot = str_to_prot(optarg);
-			break;
-		case 'p':
-			page_count = atol(optarg);
-			if (page_count <= 0) {
-				printf("The number of pages must be greater than 0\n");
-				return KSFT_FAIL;
-			}
-			break;
-		case 'l':
-			ksm_scan_limit_sec = atoi(optarg);
-			if (ksm_scan_limit_sec <= 0) {
-				printf("Timeout value must be greater than 0\n");
-				return KSFT_FAIL;
-			}
-			break;
-		case 'h':
-			print_help();
-			break;
-		case 'z':
-			if (strcmp(optarg, "0") == 0)
-				use_zero_pages = 0;
-			else
-				use_zero_pages = 1;
-			break;
-		case 'm':
-			if (strcmp(optarg, "0") == 0)
-				merge_across_nodes = 0;
-			else
-				merge_across_nodes = 1;
-			break;
-		case 's':
-			size_MB = atoi(optarg);
-			if (size_MB <= 0) {
-				printf("Size must be greater than 0\n");
-				return KSFT_FAIL;
-			}
-		case 'M':
-			break;
-		case 'U':
-			test_name = CHECK_KSM_UNMERGE;
-			break;
-		case 'Z':
-			test_name = CHECK_KSM_ZERO_PAGE_MERGE;
-			break;
-		case 'N':
-			test_name = CHECK_KSM_NUMA_MERGE;
-			break;
-		case 'P':
-			test_name = KSM_MERGE_TIME;
-			break;
-		case 'H':
-			test_name = KSM_MERGE_TIME_HUGE_PAGES;
-			break;
-		case 'D':
-			test_name = KSM_UNMERGE_TIME;
-			break;
-		case 'C':
-			test_name = KSM_COW_TIME;
-			break;
-		default:
-			return KSFT_FAIL;
-		}
-	}
-
-	if (prot == 0)
-		prot = str_to_prot(KSM_PROT_STR_DEFAULT);
-
-	if (access(KSM_SYSFS_PATH, F_OK)) {
-		printf("Config KSM not enabled\n");
-		return KSFT_SKIP;
-	}
-
-	if (ksm_save_def(&ksm_sysfs_old)) {
-		printf("Cannot save default tunables\n");
-		return KSFT_FAIL;
-	}
-
-	if (ksm_write_sysfs(KSM_FP("run"), 2) ||
-	    ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) ||
-	    numa_available() ? 0 :
-	    ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) ||
-	    ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count))
-		return KSFT_FAIL;
-
-	switch (test_name) {
-	case CHECK_KSM_MERGE:
-		ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
-				      ksm_scan_limit_sec, page_size);
-		break;
-	case CHECK_KSM_UNMERGE:
-		ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-					page_size);
-		break;
-	case CHECK_KSM_ZERO_PAGE_MERGE:
-		ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
-						ksm_scan_limit_sec, use_zero_pages, page_size);
-		break;
-	case CHECK_KSM_NUMA_MERGE:
-		ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-					   merge_across_nodes, page_size);
-		break;
-	case KSM_MERGE_TIME:
-		if (size_MB == 0) {
-			printf("Option '-s' is required.\n");
-			return KSFT_FAIL;
-		}
-		ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-				     size_MB);
-		break;
-	case KSM_MERGE_TIME_HUGE_PAGES:
-		if (size_MB == 0) {
-			printf("Option '-s' is required.\n");
-			return KSFT_FAIL;
-		}
-		ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
-					       ksm_scan_limit_sec, size_MB);
-		break;
-	case KSM_UNMERGE_TIME:
-		if (size_MB == 0) {
-			printf("Option '-s' is required.\n");
-			return KSFT_FAIL;
-		}
-		ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
-				       ksm_scan_limit_sec, size_MB);
-		break;
-	case KSM_COW_TIME:
-		ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-				   page_size);
-		break;
-	}
-
-	if (ksm_restore(&ksm_sysfs_old)) {
-		printf("Cannot restore default tunables\n");
-		return KSFT_FAIL;
-	}
-
-	return ret;
-}
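[Editorial note, not part of the patch: the expected-counter arithmetic in assert_ksm_pages_count() above can be restated as a small helper. This is a sketch derived directly from the comment and checks in the deleted code: with n identical pages and a max_page_sharing limit, a remainder of 1 behaves like a remainder of 0 because a single leftover page has nothing to merge with.]

struct ksm_expect {
	unsigned long pages_shared;	/* stable, KSM-owned pages */
	unsigned long pages_sharing;	/* user pages mapped onto them */
};

static struct ksm_expect expect_counters(unsigned long n, unsigned long max_share)
{
	struct ksm_expect e;

	if (n % max_share == 0 || n % max_share == 1) {
		/* e.g. n = 11, max_share = 10: the 11th page stays unmerged */
		e.pages_shared = n / max_share;
		e.pages_sharing = e.pages_shared * (max_share - 1);
	} else {
		e.pages_shared = n / max_share + 1;
		e.pages_sharing = n - e.pages_shared;
	}
	return e;
}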
diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c
deleted file mode 100644
index 262eae6b58f2..000000000000
--- a/tools/testing/selftests/vm/madv_populate.c
+++ /dev/null
@@ -1,296 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
- *
- * Copyright 2021, Red Hat, Inc.
- *
- * Author(s): David Hildenbrand <david@redhat.com>
- */
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <string.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <linux/mman.h>
-#include <sys/mman.h>
-
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#ifndef MADV_POPULATE_READ
-#define MADV_POPULATE_READ	22
-#endif /* MADV_POPULATE_READ */
-#ifndef MADV_POPULATE_WRITE
-#define MADV_POPULATE_WRITE	23
-#endif /* MADV_POPULATE_WRITE */
-
-/*
- * For now, we're using 2 MiB of private anonymous memory for all tests.
- */
-#define SIZE (2 * 1024 * 1024)
-
-static size_t pagesize;
-
-static void sense_support(void)
-{
-	char *addr;
-	int ret;
-
-	addr = mmap(0, pagesize, PROT_READ | PROT_WRITE,
-		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (!addr)
-		ksft_exit_fail_msg("mmap failed\n");
-
-	ret = madvise(addr, pagesize, MADV_POPULATE_READ);
-	if (ret)
-		ksft_exit_skip("MADV_POPULATE_READ is not available\n");
-
-	ret = madvise(addr, pagesize, MADV_POPULATE_WRITE);
-	if (ret)
-		ksft_exit_skip("MADV_POPULATE_WRITE is not available\n");
-
-	munmap(addr, pagesize);
-}
-
-static void test_prot_read(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-	ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-	ksft_test_result(ret == -1 && errno == EINVAL,
-			 "MADV_POPULATE_WRITE with PROT_READ\n");
-
-	munmap(addr, SIZE);
-}
-
-static void test_prot_write(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-	ksft_test_result(ret == -1 && errno == EINVAL,
-			 "MADV_POPULATE_READ with PROT_WRITE\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-	ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n");
-
-	munmap(addr, SIZE);
-}
-
-static void test_holes(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-	ret = munmap(addr + pagesize, pagesize);
-	if (ret)
-		ksft_exit_fail_msg("munmap failed\n");
-
-	/* Hole in the middle */
-	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_READ with holes in the middle\n");
-	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_WRITE with holes in the middle\n");
-
-	/* Hole at end */
-	ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_READ with holes at the end\n");
-	ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_WRITE with holes at the end\n");
-
-	/* Hole at beginning */
-	ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_READ with holes at the beginning\n");
-	ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_WRITE with holes at the beginning\n");
-
-	munmap(addr, SIZE);
-}
-
-static bool range_is_populated(char *start, ssize_t size)
-{
-	int fd = open("/proc/self/pagemap", O_RDONLY);
-	bool ret = true;
-
-	if (fd < 0)
-		ksft_exit_fail_msg("opening pagemap failed\n");
-	for (; size > 0 && ret; size -= pagesize, start += pagesize)
-		if (!pagemap_is_populated(fd, start))
-			ret = false;
-	close(fd);
-	return ret;
-}
-
-static bool range_is_not_populated(char *start, ssize_t size)
-{
-	int fd = open("/proc/self/pagemap", O_RDONLY);
-	bool ret = true;
-
-	if (fd < 0)
-		ksft_exit_fail_msg("opening pagemap failed\n");
-	for (; size > 0 && ret; size -= pagesize, start += pagesize)
-		if (pagemap_is_populated(fd, start))
-			ret = false;
-	close(fd);
-	return ret;
-}
-
-static void test_populate_read(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-	ksft_test_result(range_is_not_populated(addr, SIZE),
-			 "range initially not populated\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-	ksft_test_result(!ret, "MADV_POPULATE_READ\n");
-	ksft_test_result(range_is_populated(addr, SIZE),
-			 "range is populated\n");
-
-	munmap(addr, SIZE);
-}
-
-static void test_populate_write(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-	ksft_test_result(range_is_not_populated(addr, SIZE),
-			 "range initially not populated\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-	ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
-	ksft_test_result(range_is_populated(addr, SIZE),
-			 "range is populated\n");
-
-	munmap(addr, SIZE);
-}
-
-static bool range_is_softdirty(char *start, ssize_t size)
-{
-	int fd = open("/proc/self/pagemap", O_RDONLY);
-	bool ret = true;
-
-	if (fd < 0)
-		ksft_exit_fail_msg("opening pagemap failed\n");
-	for (; size > 0 && ret; size -= pagesize, start += pagesize)
-		if (!pagemap_is_softdirty(fd, start))
-			ret = false;
-	close(fd);
-	return ret;
-}
-
-static bool range_is_not_softdirty(char *start, ssize_t size)
-{
-	int fd = open("/proc/self/pagemap", O_RDONLY);
-	bool ret = true;
-
-	if (fd < 0)
-		ksft_exit_fail_msg("opening pagemap failed\n");
-	for (; size > 0 && ret; size -= pagesize, start += pagesize)
-		if (pagemap_is_softdirty(fd, start))
-			ret = false;
-	close(fd);
-	return ret;
-}
-
-static void test_softdirty(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-
-	/* Clear any softdirty bits. */
-	clear_softdirty();
-	ksft_test_result(range_is_not_softdirty(addr, SIZE),
-			 "range is not softdirty\n");
-
-	/* Populating READ should set softdirty. */
-	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-	ksft_test_result(!ret, "MADV_POPULATE_READ\n");
-	ksft_test_result(range_is_not_softdirty(addr, SIZE),
-			 "range is not softdirty\n");
-
-	/* Populating WRITE should set softdirty. */
-	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-	ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
-	ksft_test_result(range_is_softdirty(addr, SIZE),
-			 "range is softdirty\n");
-
-	munmap(addr, SIZE);
-}
-
-int main(int argc, char **argv)
-{
-	int err;
-
-	pagesize = getpagesize();
-
-	ksft_print_header();
-	ksft_set_plan(21);
-
-	sense_support();
-	test_prot_read();
-	test_prot_write();
-	test_holes();
-	test_populate_read();
-	test_populate_write();
-	test_softdirty();
-
-	err = ksft_get_fail_cnt();
-	if (err)
-		ksft_exit_fail_msg("%d out of %d tests failed\n",
-				   err, ksft_test_num());
-	return ksft_exit_pass();
-}
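[Editorial note, not part of the patch: a common use of the advice values tested above is prefaulting a fresh mapping. A minimal sketch, assuming the same fallback define as the deleted test and that kernels without support return EINVAL for unknown advice.]

#include <errno.h>
#include <sys/mman.h>

#ifndef MADV_POPULATE_WRITE
#define MADV_POPULATE_WRITE 23
#endif

/* Prefault a writable mapping; fall back to touching pages by hand. */
static int prefault_writable(char *addr, size_t size, size_t pagesize)
{
	if (!madvise(addr, size, MADV_POPULATE_WRITE))
		return 0;
	if (errno != EINVAL)	/* real failure, e.g. ENOMEM on holes */
		return -1;
	for (size_t off = 0; off < size; off += pagesize)
		addr[off] = 0;	/* write-fault each page manually */
	return 0;
}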
contained within first: - * - * base | free | - * +1 | mapped | - * +2 | mapped | new - * +3 | mapped | - * +4 | free | - */ - errno = 0; - addr = base_addr + (2 * page_size); - size = page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p != MAP_FAILED) { - dump_maps(); - printf("Error:2: mmap() succeeded when it shouldn't have\n"); - return 1; - } - - /* - * Overlap end of existing mapping: - * base | free | - * +1 | mapped | - * +2 | mapped | - * +3 | mapped | new - * +4 | free | new - */ - errno = 0; - addr = base_addr + (3 * page_size); - size = 2 * page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p != MAP_FAILED) { - dump_maps(); - printf("Error:3: mmap() succeeded when it shouldn't have\n"); - return 1; - } - - /* - * Overlap start of existing mapping: - * base | free | new - * +1 | mapped | new - * +2 | mapped | - * +3 | mapped | - * +4 | free | - */ - errno = 0; - addr = base_addr; - size = 2 * page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p != MAP_FAILED) { - dump_maps(); - printf("Error:4: mmap() succeeded when it shouldn't have\n"); - return 1; - } - - /* - * Adjacent to start of existing mapping: - * base | free | new - * +1 | mapped | - * +2 | mapped | - * +3 | mapped | - * +4 | free | - */ - errno = 0; - addr = base_addr; - size = page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p == MAP_FAILED) { - dump_maps(); - printf("Error:5: mmap() failed when it shouldn't have\n"); - return 1; - } - - /* - * Adjacent to end of existing mapping: - * base | free | - * +1 | mapped | - * +2 | mapped | - * +3 | mapped | - * +4 | free | new - */ - errno = 0; - addr = base_addr + (4 * page_size); - size = page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p == MAP_FAILED) { - dump_maps(); - printf("Error:6: mmap() failed when it shouldn't have\n"); - return 1; - } - - addr = base_addr; - size = 5 * page_size; - if (munmap((void *)addr, size) != 0) { - dump_maps(); - printf("Error: munmap failed!?\n"); - return 1; - } - printf("unmap() successful\n"); - - printf("OK\n"); - return 0; -} diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c deleted file mode 100644 index 312889edb84a..000000000000 --- a/tools/testing/selftests/vm/map_hugetlb.c +++ /dev/null @@ -1,109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Example of using hugepage memory in a user application using the mmap - * system call with MAP_HUGETLB flag. Before running this program make - * sure the administrator has allocated enough default sized huge pages - * to cover the 256 MB allocation. - * - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. 
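(A hedged aside on the shift argument parsed in main() below: the value is log2 of the desired page size, packed into the upper mmap flag bits. Explicitly requesting 2 MB pages, assuming the kernel has some reserved and that the MAP_HUGE_* macros are available (the fallback defines just below cover older headers), might look like:

	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		// 21 == log2(2 MB), encoded via MAP_HUGE_SHIFT/MAP_HUGE_MASK.
		int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
			    ((21 & MAP_HUGE_MASK) << MAP_HUGE_SHIFT);
		void *p = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE, flags, -1, 0);

		if (p == MAP_FAILED)
			perror("mmap");	// typically ENOMEM when no 2 MB pages are reserved
		else
			munmap(p, 2UL << 20);
		return 0;
	}
)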
- */ -#include -#include -#include -#include -#include - -#define LENGTH (256UL*1024*1024) -#define PROTECTION (PROT_READ | PROT_WRITE) - -#ifndef MAP_HUGETLB -#define MAP_HUGETLB 0x40000 /* arch specific */ -#endif - -#ifndef MAP_HUGE_SHIFT -#define MAP_HUGE_SHIFT 26 -#endif - -#ifndef MAP_HUGE_MASK -#define MAP_HUGE_MASK 0x3f -#endif - -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - -static void check_bytes(char *addr) -{ - printf("First hex is %x\n", *((unsigned int *)addr)); -} - -static void write_bytes(char *addr, size_t length) -{ - unsigned long i; - - for (i = 0; i < length; i++) - *(addr + i) = (char)i; -} - -static int read_bytes(char *addr, size_t length) -{ - unsigned long i; - - check_bytes(addr); - for (i = 0; i < length; i++) - if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); - return 1; - } - return 0; -} - -int main(int argc, char **argv) -{ - void *addr; - int ret; - size_t length = LENGTH; - int flags = FLAGS; - int shift = 0; - - if (argc > 1) - length = atol(argv[1]) << 20; - if (argc > 2) { - shift = atoi(argv[2]); - if (shift) - flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; - } - - if (shift) - printf("%u kB hugepages\n", 1 << (shift - 10)); - else - printf("Default size hugepages\n"); - printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20); - - addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - printf("Returned address is %p\n", addr); - check_bytes(addr); - write_bytes(addr, length); - ret = read_bytes(addr, length); - - /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, length)) { - perror("munmap"); - exit(1); - } - - return ret; -} diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/vm/map_populate.c deleted file mode 100644 index 6b8aeaa0bf7a..000000000000 --- a/tools/testing/selftests/vm/map_populate.c +++ /dev/null @@ -1,113 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2018 Dmitry Safonov, Arista Networks - * - * MAP_POPULATE | MAP_PRIVATE should COW VMA pages. 
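(Put differently: pages that MAP_POPULATE pre-faults into a writable private mapping must be copy-on-write snapshots, not aliases of the page cache. A condensed single-process sketch of that invariant, noting that the real test below uses fork() and a socketpair to order the steps across two processes, is:

	#include <assert.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		FILE *f = tmpfile();
		long *shared, *priv;

		assert(f && !ftruncate(fileno(f), 4096));
		shared = mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
			      fileno(f), 0);
		assert(shared != MAP_FAILED);
		*shared = 0xdead;	// initial file contents

		priv = mmap(0, 4096, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_POPULATE, fileno(f), 0);
		assert(priv != MAP_FAILED && *priv == 0xdead);

		*shared = 0xbeef;	// write through the shared mapping...
		assert(*priv == 0xdead);	// ...must not show up in the private copy
		return 0;
	}
)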
- */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef MMAP_SZ -#define MMAP_SZ 4096 -#endif - -#define BUG_ON(condition, description) \ - do { \ - if (condition) { \ - fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \ - __LINE__, (description), strerror(errno)); \ - exit(1); \ - } \ - } while (0) - -static int parent_f(int sock, unsigned long *smap, int child) -{ - int status, ret; - - ret = read(sock, &status, sizeof(int)); - BUG_ON(ret <= 0, "read(sock)"); - - *smap = 0x22222BAD; - ret = msync(smap, MMAP_SZ, MS_SYNC); - BUG_ON(ret, "msync()"); - - ret = write(sock, &status, sizeof(int)); - BUG_ON(ret <= 0, "write(sock)"); - - waitpid(child, &status, 0); - BUG_ON(!WIFEXITED(status), "child in unexpected state"); - - return WEXITSTATUS(status); -} - -static int child_f(int sock, unsigned long *smap, int fd) -{ - int ret, buf = 0; - - smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_POPULATE, fd, 0); - BUG_ON(smap == MAP_FAILED, "mmap()"); - - BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file"); - - ret = write(sock, &buf, sizeof(int)); - BUG_ON(ret <= 0, "write(sock)"); - - ret = read(sock, &buf, sizeof(int)); - BUG_ON(ret <= 0, "read(sock)"); - - BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page"); - BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted"); - - return 0; -} - -int main(int argc, char **argv) -{ - int sock[2], child, ret; - FILE *ftmp; - unsigned long *smap; - - ftmp = tmpfile(); - BUG_ON(ftmp == 0, "tmpfile()"); - - ret = ftruncate(fileno(ftmp), MMAP_SZ); - BUG_ON(ret, "ftruncate()"); - - smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, - MAP_SHARED, fileno(ftmp), 0); - BUG_ON(smap == MAP_FAILED, "mmap()"); - - *smap = 0xdeadbabe; - /* Probably unnecessary, but let it be. */ - ret = msync(smap, MMAP_SZ, MS_SYNC); - BUG_ON(ret, "msync()"); - - ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock); - BUG_ON(ret, "socketpair()"); - - child = fork(); - BUG_ON(child == -1, "fork()"); - - if (child) { - ret = close(sock[0]); - BUG_ON(ret, "close()"); - - return parent_f(sock[1], smap, child); - } - - ret = close(sock[1]); - BUG_ON(ret, "close()"); - - return child_f(sock[0], smap, fileno(ftmp)); -} diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c deleted file mode 100644 index 957b9e18c729..000000000000 --- a/tools/testing/selftests/vm/memfd_secret.c +++ /dev/null @@ -1,296 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright IBM Corporation, 2021 - * - * Author: Mike Rapoport - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "../kselftest.h" - -#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__) -#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) -#define skip(fmt, ...) 
ksft_test_result_skip(fmt, ##__VA_ARGS__) - -#ifdef __NR_memfd_secret - -#define PATTERN 0x55 - -static const int prot = PROT_READ | PROT_WRITE; -static const int mode = MAP_SHARED; - -static unsigned long page_size; -static unsigned long mlock_limit_cur; -static unsigned long mlock_limit_max; - -static int memfd_secret(unsigned int flags) -{ - return syscall(__NR_memfd_secret, flags); -} - -static void test_file_apis(int fd) -{ - char buf[64]; - - if ((read(fd, buf, sizeof(buf)) >= 0) || - (write(fd, buf, sizeof(buf)) >= 0) || - (pread(fd, buf, sizeof(buf), 0) >= 0) || - (pwrite(fd, buf, sizeof(buf), 0) >= 0)) - fail("unexpected file IO\n"); - else - pass("file IO is blocked as expected\n"); -} - -static void test_mlock_limit(int fd) -{ - size_t len; - char *mem; - - len = mlock_limit_cur; - mem = mmap(NULL, len, prot, mode, fd, 0); - if (mem == MAP_FAILED) { - fail("unable to mmap secret memory\n"); - return; - } - munmap(mem, len); - - len = mlock_limit_max * 2; - mem = mmap(NULL, len, prot, mode, fd, 0); - if (mem != MAP_FAILED) { - fail("unexpected mlock limit violation\n"); - munmap(mem, len); - return; - } - - pass("mlock limit is respected\n"); -} - -static void try_process_vm_read(int fd, int pipefd[2]) -{ - struct iovec liov, riov; - char buf[64]; - char *mem; - - if (read(pipefd[0], &mem, sizeof(mem)) < 0) { - fail("pipe write: %s\n", strerror(errno)); - exit(KSFT_FAIL); - } - - liov.iov_len = riov.iov_len = sizeof(buf); - liov.iov_base = buf; - riov.iov_base = mem; - - if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) { - if (errno == ENOSYS) - exit(KSFT_SKIP); - exit(KSFT_PASS); - } - - exit(KSFT_FAIL); -} - -static void try_ptrace(int fd, int pipefd[2]) -{ - pid_t ppid = getppid(); - int status; - char *mem; - long ret; - - if (read(pipefd[0], &mem, sizeof(mem)) < 0) { - perror("pipe write"); - exit(KSFT_FAIL); - } - - ret = ptrace(PTRACE_ATTACH, ppid, 0, 0); - if (ret) { - perror("ptrace_attach"); - exit(KSFT_FAIL); - } - - ret = waitpid(ppid, &status, WUNTRACED); - if ((ret != ppid) || !(WIFSTOPPED(status))) { - fprintf(stderr, "weird waitppid result %ld stat %x\n", - ret, status); - exit(KSFT_FAIL); - } - - if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0)) - exit(KSFT_PASS); - - exit(KSFT_FAIL); -} - -static void check_child_status(pid_t pid, const char *name) -{ - int status; - - waitpid(pid, &status, 0); - - if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) { - skip("%s is not supported\n", name); - return; - } - - if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) || - WIFSIGNALED(status)) { - pass("%s is blocked as expected\n", name); - return; - } - - fail("%s: unexpected memory access\n", name); -} - -static void test_remote_access(int fd, const char *name, - void (*func)(int fd, int pipefd[2])) -{ - int pipefd[2]; - pid_t pid; - char *mem; - - if (pipe(pipefd)) { - fail("pipe failed: %s\n", strerror(errno)); - return; - } - - pid = fork(); - if (pid < 0) { - fail("fork failed: %s\n", strerror(errno)); - return; - } - - if (pid == 0) { - func(fd, pipefd); - return; - } - - mem = mmap(NULL, page_size, prot, mode, fd, 0); - if (mem == MAP_FAILED) { - fail("Unable to mmap secret memory\n"); - return; - } - - ftruncate(fd, page_size); - memset(mem, PATTERN, page_size); - - if (write(pipefd[1], &mem, sizeof(mem)) < 0) { - fail("pipe write: %s\n", strerror(errno)); - return; - } - - check_child_status(pid, name); -} - -static void test_process_vm_read(int fd) -{ - test_remote_access(fd, "process_vm_read", try_process_vm_read); -} - -static void 
test_ptrace(int fd) -{ - test_remote_access(fd, "ptrace", try_ptrace); -} - -static int set_cap_limits(rlim_t max) -{ - struct rlimit new; - cap_t cap = cap_init(); - - new.rlim_cur = max; - new.rlim_max = max; - if (setrlimit(RLIMIT_MEMLOCK, &new)) { - perror("setrlimit() returns error"); - return -1; - } - - /* drop capabilities including CAP_IPC_LOCK */ - if (cap_set_proc(cap)) { - perror("cap_set_proc() returns error"); - return -2; - } - - return 0; -} - -static void prepare(void) -{ - struct rlimit rlim; - - page_size = sysconf(_SC_PAGE_SIZE); - if (!page_size) - ksft_exit_fail_msg("Failed to get page size %s\n", - strerror(errno)); - - if (getrlimit(RLIMIT_MEMLOCK, &rlim)) - ksft_exit_fail_msg("Unable to detect mlock limit: %s\n", - strerror(errno)); - - mlock_limit_cur = rlim.rlim_cur; - mlock_limit_max = rlim.rlim_max; - - printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n", - page_size, mlock_limit_cur, mlock_limit_max); - - if (page_size > mlock_limit_cur) - mlock_limit_cur = page_size; - if (page_size > mlock_limit_max) - mlock_limit_max = page_size; - - if (set_cap_limits(mlock_limit_max)) - ksft_exit_fail_msg("Unable to set mlock limit: %s\n", - strerror(errno)); -} - -#define NUM_TESTS 4 - -int main(int argc, char *argv[]) -{ - int fd; - - prepare(); - - ksft_print_header(); - ksft_set_plan(NUM_TESTS); - - fd = memfd_secret(0); - if (fd < 0) { - if (errno == ENOSYS) - ksft_exit_skip("memfd_secret is not supported\n"); - else - ksft_exit_fail_msg("memfd_secret failed: %s\n", - strerror(errno)); - } - - test_mlock_limit(fd); - test_file_apis(fd); - test_process_vm_read(fd); - test_ptrace(fd); - - close(fd); - - ksft_finished(); -} - -#else /* __NR_memfd_secret */ - -int main(int argc, char *argv[]) -{ - printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); - return KSFT_SKIP; -} - -#endif /* __NR_memfd_secret */ diff --git a/tools/testing/selftests/vm/migration.c b/tools/testing/selftests/vm/migration.c deleted file mode 100644 index 1cec8425e3ca..000000000000 --- a/tools/testing/selftests/vm/migration.c +++ /dev/null @@ -1,193 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * The main purpose of the tests here is to exercise the migration entry code - * paths in the kernel. 
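(The migrate() helper below drives this with move_pages(2). A standalone hedged example, assuming libnuma's <numaif.h> prototype and linking with -lnuma, that migrates one page of the calling process to node 0:

	#include <numaif.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		void *page;
		int node = 0, status = -1;

		if (posix_memalign(&page, 4096, 4096))
			return 1;
		*(volatile char *)page = 1;	// fault the page in first

		// pid 0 means "this process"; status receives the resulting node.
		long ret = move_pages(0, 1, &page, &node, &status, MPOL_MF_MOVE);
		if (ret < 0)
			perror("move_pages");
		else
			printf("page is now on node %d\n", status);
		free(page);
		return 0;
	}
)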
- */ - -#include "../kselftest_harness.h" -#include -#include -#include -#include -#include -#include -#include -#include - -#define TWOMEG (2<<20) -#define RUNTIME (60) - -#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) - -FIXTURE(migration) -{ - pthread_t *threads; - pid_t *pids; - int nthreads; - int n1; - int n2; -}; - -FIXTURE_SETUP(migration) -{ - int n; - - ASSERT_EQ(numa_available(), 0); - self->nthreads = numa_num_task_cpus() - 1; - self->n1 = -1; - self->n2 = -1; - - for (n = 0; n < numa_max_possible_node(); n++) - if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) { - if (self->n1 == -1) { - self->n1 = n; - } else { - self->n2 = n; - break; - } - } - - self->threads = malloc(self->nthreads * sizeof(*self->threads)); - ASSERT_NE(self->threads, NULL); - self->pids = malloc(self->nthreads * sizeof(*self->pids)); - ASSERT_NE(self->pids, NULL); -}; - -FIXTURE_TEARDOWN(migration) -{ - free(self->threads); - free(self->pids); -} - -int migrate(uint64_t *ptr, int n1, int n2) -{ - int ret, tmp; - int status = 0; - struct timespec ts1, ts2; - - if (clock_gettime(CLOCK_MONOTONIC, &ts1)) - return -1; - - while (1) { - if (clock_gettime(CLOCK_MONOTONIC, &ts2)) - return -1; - - if (ts2.tv_sec - ts1.tv_sec >= RUNTIME) - return 0; - - ret = move_pages(0, 1, (void **) &ptr, &n2, &status, - MPOL_MF_MOVE_ALL); - if (ret) { - if (ret > 0) - printf("Didn't migrate %d pages\n", ret); - else - perror("Couldn't migrate pages"); - return -2; - } - - tmp = n2; - n2 = n1; - n1 = tmp; - } - - return 0; -} - -void *access_mem(void *ptr) -{ - uint64_t y = 0; - volatile uint64_t *x = ptr; - - while (1) { - pthread_testcancel(); - y += *x; - } - - return NULL; -} - -/* - * Basic migration entry testing. One thread will move pages back and forth - * between nodes whilst other threads try and access them triggering the - * migration entry wait paths in the kernel. - */ -TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME) -{ - uint64_t *ptr; - int i; - - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); - - ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(ptr, MAP_FAILED); - - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) - if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) - perror("Couldn't create thread"); - - ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) - ASSERT_EQ(pthread_cancel(self->threads[i]), 0); -} - -/* - * Same as the previous test but with shared memory. - */ -TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) -{ - pid_t pid; - uint64_t *ptr; - int i; - - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); - - ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(ptr, MAP_FAILED); - - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) { - pid = fork(); - if (!pid) - access_mem(ptr); - else - self->pids[i] = pid; - } - - ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) - ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); -} - -/* - * Tests the pmd migration entry paths. 
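(The private_anon_thp case below relies on a common idiom: over-allocate, round up to a 2 MB boundary, then ask for THP. Shown standalone, assuming 2 MB huge pages and that MADV_HUGEPAGE is only a hint:

	#include <stdint.h>
	#include <sys/mman.h>

	#define TWOMEG (2UL << 20)

	// Over-allocating by one extra huge page guarantees that a 2 MB-aligned
	// subrange exists somewhere inside the mapping.
	static void *alloc_thp_candidate(void)
	{
		char *p = mmap(NULL, 2 * TWOMEG, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			return NULL;
		p = (char *)(((uintptr_t)p + TWOMEG - 1) & ~(TWOMEG - 1));
		madvise(p, TWOMEG, MADV_HUGEPAGE);	// best-effort hint
		return p;
	}
)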
- */
-TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME)
-{
-	uint64_t *ptr;
-	int i;
-
-	if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
-		SKIP(return, "Not enough threads or NUMA nodes available");
-
-	ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE,
-		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	ASSERT_NE(ptr, MAP_FAILED);
-
-	ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG);
-	ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0);
-	memset(ptr, 0xde, TWOMEG);
-	for (i = 0; i < self->nthreads - 1; i++)
-		if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
-			perror("Couldn't create thread");
-
-	ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
-	for (i = 0; i < self->nthreads - 1; i++)
-		ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
-}
-
-TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c
deleted file mode 100644
index 782ea94dee2f..000000000000
--- a/tools/testing/selftests/vm/mlock-random-test.c
+++ /dev/null
@@ -1,294 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Test mlock() and mlock2() when they are invoked
- * on random memory regions.
- */
-#include <sys/capability.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include "mlock2.h"
-
-#define CHUNK_UNIT (128 * 1024)
-#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
-#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
-#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
-
-#define TEST_LOOP 100
-#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
-
-int set_cap_limits(rlim_t max)
-{
-	struct rlimit new;
-	cap_t cap = cap_init();
-
-	new.rlim_cur = max;
-	new.rlim_max = max;
-	if (setrlimit(RLIMIT_MEMLOCK, &new)) {
-		perror("setrlimit() returns error\n");
-		return -1;
-	}
-
-	/* drop capabilities including CAP_IPC_LOCK */
-	if (cap_set_proc(cap)) {
-		perror("cap_set_proc() returns error\n");
-		return -2;
-	}
-
-	return 0;
-}
-
-int get_proc_locked_vm_size(void)
-{
-	FILE *f;
-	int ret = -1;
-	char line[1024] = {0};
-	unsigned long lock_size = 0;
-
-	f = fopen("/proc/self/status", "r");
-	if (!f) {
-		perror("fopen");
-		return -1;
-	}
-
-	while (fgets(line, 1024, f)) {
-		if (strstr(line, "VmLck")) {
-			ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
-			if (ret <= 0) {
-				printf("sscanf() on VmLck error: %s: %d\n",
-						line, ret);
-				fclose(f);
-				return -1;
-			}
-			fclose(f);
-			return (int)(lock_size << 10);
-		}
-	}
-
-	perror("cannot parse VmLck in /proc/self/status\n");
-	fclose(f);
-	return -1;
-}
-
-/*
- * Get the MMUPageSize of the memory region including input
- * address from proc file.
- *
- * return value: on error case, 0 will be returned.
- * Otherwise the page size(in bytes) is returned.
- */
-int get_proc_page_size(unsigned long addr)
-{
-	FILE *smaps;
-	char *line;
-	unsigned long mmupage_size = 0;
-	size_t size;
-
-	smaps = seek_to_smaps_entry(addr);
-	if (!smaps) {
-		printf("Unable to parse /proc/self/smaps\n");
-		return 0;
-	}
-
-	while (getline(&line, &size, smaps) > 0) {
-		if (!strstr(line, "MMUPageSize")) {
-			free(line);
-			line = NULL;
-			size = 0;
-			continue;
-		}
-
-		/* found the MMUPageSize of this section */
-		if (sscanf(line, "MMUPageSize: %8lu kB",
-			   &mmupage_size) < 1) {
-			printf("Unable to parse smaps entry for Size:%s\n",
-			       line);
-			break;
-		}
-
-	}
-	free(line);
-	if (smaps)
-		fclose(smaps);
-	return mmupage_size << 10;
-}
-
-/*
- * Test mlock/mlock2() on provided memory chunk.
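- * (Note: mlock2() has no glibc wrapper on older systems; the mlock2_()
- * helper in mlock2.h further down in this patch wraps
- * syscall(__NR_mlock2, ...) and reports ENOSYS when unavailable.)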
- * It expects mlock() and mlock2() to succeed (the locked range stays
- * within the rlimit).
- *
- * With allocated memory chunk [p, p + alloc_size), this
- * test randomly chooses start/len and performs mlock/mlock2
- * on the [start, start + len] memory range. The range stays
- * within the allocated chunk.
- *
- * The memory region size alloc_size is within the rlimit,
- * so we always expect mlock/mlock2 to succeed.
- *
- * VmLck is assumed to be 0 before this test.
- *
- * return value: 0 - success
- * else: failure
- */
-int test_mlock_within_limit(char *p, int alloc_size)
-{
-	int i;
-	int ret = 0;
-	int locked_vm_size = 0;
-	struct rlimit cur;
-	int page_size = 0;
-
-	getrlimit(RLIMIT_MEMLOCK, &cur);
-	if (cur.rlim_cur < alloc_size) {
-		printf("alloc_size[%d] < %u rlimit, leading to mlock failure\n",
-		       alloc_size, (unsigned int)cur.rlim_cur);
-		return -1;
-	}
-
-	srand(time(NULL));
-	for (i = 0; i < TEST_LOOP; i++) {
-		/*
-		 * - choose mlock/mlock2 randomly
-		 * - choose lock_size randomly but lock_size < alloc_size
-		 * - choose start_offset randomly but p+start_offset+lock_size
-		 *   < p+alloc_size
-		 */
-		int is_mlock = !!(rand() % 2);
-		int lock_size = rand() % alloc_size;
-		int start_offset = rand() % (alloc_size - lock_size);
-
-		if (is_mlock)
-			ret = mlock(p + start_offset, lock_size);
-		else
-			ret = mlock2_(p + start_offset, lock_size,
-				      MLOCK_ONFAULT);
-
-		if (ret) {
-			printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
			       is_mlock ? "mlock" : "mlock2",
-			       p, alloc_size,
-			       p + start_offset, lock_size);
-			return ret;
-		}
-	}
-
-	/*
-	 * Check VmLck left by the tests.
-	 */
-	locked_vm_size = get_proc_locked_vm_size();
-	page_size = get_proc_page_size((unsigned long)p);
-	if (page_size == 0) {
-		printf("cannot get proc MMUPageSize\n");
-		return -1;
-	}
-
-	if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
-		printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
-		       locked_vm_size, alloc_size);
-		return -1;
-	}
-
-	return 0;
-}
-
-
-/*
- * We expect mlock() and mlock2() to fail (the range exceeds the rlimit).
- *
- * With allocated memory chunk [p, p + alloc_size), this
- * test will randomly choose start/len and perform mlock/mlock2
- * on [start, start+len] range.
- *
- * The memory region size alloc_size is above the rlimit,
- * and the len to be locked is higher than the rlimit,
- * so we always expect mlock/mlock2 to fail.
- * The number of locked pages must not increase as a side effect.
- *
- * return value: 0 - success
- * else: failure
- */
-int test_mlock_outof_limit(char *p, int alloc_size)
-{
-	int i;
-	int ret = 0;
-	int locked_vm_size = 0, old_locked_vm_size = 0;
-	struct rlimit cur;
-
-	getrlimit(RLIMIT_MEMLOCK, &cur);
-	if (cur.rlim_cur >= alloc_size) {
-		printf("alloc_size[%d] > %u rlimit, violates test condition\n",
-		       alloc_size, (unsigned int)cur.rlim_cur);
-		return -1;
-	}
-
-	old_locked_vm_size = get_proc_locked_vm_size();
-	srand(time(NULL));
-	for (i = 0; i < TEST_LOOP; i++) {
-		int is_mlock = !!(rand() % 2);
-		int lock_size = (rand() % (alloc_size - cur.rlim_cur))
-				+ cur.rlim_cur;
-		int start_offset = rand() % (alloc_size - lock_size);
-
-		if (is_mlock)
-			ret = mlock(p + start_offset, lock_size);
-		else
-			ret = mlock2_(p + start_offset, lock_size,
-				      MLOCK_ONFAULT);
-		if (ret == 0) {
-			printf("%s() succeeds? on %p(%d) mlock:%p(%d)\n",
-			       is_mlock ? "mlock" : "mlock2",
-			       p, alloc_size,
-			       p + start_offset, lock_size);
-			return -1;
-		}
-	}
-
-	locked_vm_size = get_proc_locked_vm_size();
-	if (locked_vm_size != old_locked_vm_size) {
-		printf("test leads to new mlocked pages: old[%d], new[%d]\n",
-		       old_locked_vm_size,
-		       locked_vm_size);
-		return -1;
-	}
-
-	return 0;
-}
-
-int main(int argc, char **argv)
-{
-	char *p = NULL;
-	int ret = 0;
-
-	if (set_cap_limits(MLOCK_RLIMIT_SIZE))
-		return -1;
-
-	p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
-	if (p == NULL) {
-		perror("malloc() failure\n");
-		return -1;
-	}
-	ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
-	if (ret)
-		return ret;
-	munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
-	free(p);
-
-
-	p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
-	if (p == NULL) {
-		perror("malloc() failure\n");
-		return -1;
-	}
-	ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
-	if (ret)
-		return ret;
-	munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
-	free(p);
-
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c
deleted file mode 100644
index 11b2301f3aa3..000000000000
--- a/tools/testing/selftests/vm/mlock2-tests.c
+++ /dev/null
@@ -1,520 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE
-#include <sys/mman.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <stdbool.h>
-#include "mlock2.h"
-
-#include "../kselftest.h"
-
-struct vm_boundaries {
-	unsigned long start;
-	unsigned long end;
-};
-
-static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
-{
-	FILE *file;
-	int ret = 1;
-	char line[1024] = {0};
-	char *end_addr;
-	char *stop;
-	unsigned long start;
-	unsigned long end;
-
-	if (!area)
-		return ret;
-
-	file = fopen("/proc/self/maps", "r");
-	if (!file) {
-		perror("fopen");
-		return ret;
-	}
-
-	memset(area, 0, sizeof(struct vm_boundaries));
-
-	while (fgets(line, 1024, file)) {
-		end_addr = strchr(line, '-');
-		if (!end_addr) {
-			printf("cannot parse /proc/self/maps\n");
-			goto out;
-		}
-		*end_addr = '\0';
-		end_addr++;
-		stop = strchr(end_addr, ' ');
-		if (!stop) {
-			printf("cannot parse /proc/self/maps\n");
-			goto out;
-		}
-		*stop = '\0';
-
-		sscanf(line, "%lx", &start);
-		sscanf(end_addr, "%lx", &end);
-
-		if (start <= addr && end > addr) {
-			area->start = start;
-			area->end = end;
-			ret = 0;
-			goto out;
-		}
-	}
-out:
-	fclose(file);
-	return ret;
-}
-
-#define VMFLAGS "VmFlags:"
-
-static bool is_vmflag_set(unsigned long addr, const char *vmflag)
-{
-	char *line = NULL;
-	char *flags;
-	size_t size = 0;
-	bool ret = false;
-	FILE *smaps;
-
-	smaps = seek_to_smaps_entry(addr);
-	if (!smaps) {
-		printf("Unable to parse /proc/self/smaps\n");
-		goto out;
-	}
-
-	while (getline(&line, &size, smaps) > 0) {
-		if (!strstr(line, VMFLAGS)) {
-			free(line);
-			line = NULL;
-			size = 0;
-			continue;
-		}
-
-		flags = line + strlen(VMFLAGS);
-		ret = (strstr(flags, vmflag) != NULL);
-		goto out;
-	}
-
-out:
-	free(line);
-	fclose(smaps);
-	return ret;
-}
-
-#define SIZE "Size:"
-#define RSS "Rss:"
-#define LOCKED "lo"
-
-static unsigned long get_value_for_name(unsigned long addr, const char *name)
-{
-	char *line = NULL;
-	size_t size = 0;
-	char *value_ptr;
-	FILE *smaps = NULL;
-	unsigned long value = -1UL;
-
-	smaps = seek_to_smaps_entry(addr);
-	if (!smaps) {
-		printf("Unable to parse /proc/self/smaps\n");
-		goto out;
-	}
-
-	while (getline(&line, &size, smaps) > 0) {
-		if (!strstr(line, name)) {
-			free(line);
-			line = NULL;
-			size = 0;
-			continue;
-		}
-
-		value_ptr = line + strlen(name);
-		if (sscanf(value_ptr, "%lu kB", &value) < 1) {
printf("Unable to parse smaps entry for Size\n"); - goto out; - } - break; - } - -out: - if (smaps) - fclose(smaps); - free(line); - return value; -} - -static bool is_vma_lock_on_fault(unsigned long addr) -{ - bool locked; - unsigned long vma_size, vma_rss; - - locked = is_vmflag_set(addr, LOCKED); - if (!locked) - return false; - - vma_size = get_value_for_name(addr, SIZE); - vma_rss = get_value_for_name(addr, RSS); - - /* only one page is faulted in */ - return (vma_rss < vma_size); -} - -#define PRESENT_BIT 0x8000000000000000ULL -#define PFN_MASK 0x007FFFFFFFFFFFFFULL -#define UNEVICTABLE_BIT (1UL << 18) - -static int lock_check(unsigned long addr) -{ - bool locked; - unsigned long vma_size, vma_rss; - - locked = is_vmflag_set(addr, LOCKED); - if (!locked) - return false; - - vma_size = get_value_for_name(addr, SIZE); - vma_rss = get_value_for_name(addr, RSS); - - return (vma_rss == vma_size); -} - -static int unlock_lock_check(char *map) -{ - if (is_vmflag_set((unsigned long)map, LOCKED)) { - printf("VMA flag %s is present on page 1 after unlock\n", LOCKED); - return 1; - } - - return 0; -} - -static int test_mlock_lock() -{ - char *map; - int ret = 1; - unsigned long page_size = getpagesize(); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } - - if (mlock2_(map, 2 * page_size, 0)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(0)"); - goto unmap; - } - - if (!lock_check((unsigned long)map)) - goto unmap; - - /* Now unlock and recheck attributes */ - if (munlock(map, 2 * page_size)) { - perror("munlock()"); - goto unmap; - } - - ret = unlock_lock_check(map); - -unmap: - munmap(map, 2 * page_size); -out: - return ret; -} - -static int onfault_check(char *map) -{ - *map = 'a'; - if (!is_vma_lock_on_fault((unsigned long)map)) { - printf("VMA is not marked for lock on fault\n"); - return 1; - } - - return 0; -} - -static int unlock_onfault_check(char *map) -{ - unsigned long page_size = getpagesize(); - - if (is_vma_lock_on_fault((unsigned long)map) || - is_vma_lock_on_fault((unsigned long)map + page_size)) { - printf("VMA is still lock on fault after unlock\n"); - return 1; - } - - return 0; -} - -static int test_mlock_onfault() -{ - char *map; - int ret = 1; - unsigned long page_size = getpagesize(); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } - - if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(MLOCK_ONFAULT)"); - goto unmap; - } - - if (onfault_check(map)) - goto unmap; - - /* Now unlock and recheck attributes */ - if (munlock(map, 2 * page_size)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("munlock()"); - goto unmap; - } - - ret = unlock_onfault_check(map); -unmap: - munmap(map, 2 * page_size); -out: - return ret; -} - -static int test_lock_onfault_of_present() -{ - char *map; - int ret = 1; - unsigned long page_size = getpagesize(); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } - - *map = 'a'; - - if (mlock2_(map, 2 * 
page_size, MLOCK_ONFAULT)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("mlock2(MLOCK_ONFAULT)");
-		goto unmap;
-	}
-
-	if (!is_vma_lock_on_fault((unsigned long)map) ||
-	    !is_vma_lock_on_fault((unsigned long)map + page_size)) {
-		printf("VMA with present pages is not marked lock on fault\n");
-		goto unmap;
-	}
-	ret = 0;
-unmap:
-	munmap(map, 2 * page_size);
-out:
-	return ret;
-}
-
-static int test_munlockall()
-{
-	char *map;
-	int ret = 1;
-	unsigned long page_size = getpagesize();
-
-	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
-	if (map == MAP_FAILED) {
-		perror("test_munlockall mmap");
-		goto out;
-	}
-
-	if (mlockall(MCL_CURRENT)) {
-		perror("mlockall(MCL_CURRENT)");
-		goto out;
-	}
-
-	if (!lock_check((unsigned long)map))
-		goto unmap;
-
-	if (munlockall()) {
-		perror("munlockall()");
-		goto unmap;
-	}
-
-	if (unlock_lock_check(map))
-		goto unmap;
-
-	munmap(map, 2 * page_size);
-
-	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
-	if (map == MAP_FAILED) {
-		perror("test_munlockall second mmap");
-		goto out;
-	}
-
-	if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
-		perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
-		goto unmap;
-	}
-
-	if (onfault_check(map))
-		goto unmap;
-
-	if (munlockall()) {
-		perror("munlockall()");
-		goto unmap;
-	}
-
-	if (unlock_onfault_check(map))
-		goto unmap;
-
-	if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
-		perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
-		goto out;
-	}
-
-	if (!lock_check((unsigned long)map))
-		goto unmap;
-
-	if (munlockall()) {
-		perror("munlockall()");
-		goto unmap;
-	}
-
-	ret = unlock_lock_check(map);
-
-unmap:
-	munmap(map, 2 * page_size);
-out:
-	munlockall();
-	return ret;
-}
-
-static int test_vma_management(bool call_mlock)
-{
-	int ret = 1;
-	void *map;
-	unsigned long page_size = getpagesize();
-	struct vm_boundaries page1;
-	struct vm_boundaries page2;
-	struct vm_boundaries page3;
-
-	map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
-		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (map == MAP_FAILED) {
-		perror("mmap()");
-		return ret;
-	}
-
-	if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("mlock(ONFAULT)\n");
-		goto out;
-	}
-
-	if (get_vm_area((unsigned long)map, &page1) ||
-	    get_vm_area((unsigned long)map + page_size, &page2) ||
-	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
-		printf("couldn't find mapping in /proc/self/maps\n");
-		goto out;
-	}
-
-	/*
-	 * Before we unlock a portion, we need to ensure that all three pages
-	 * are in the same VMA.
If they are not we abort this test (Note that this is - * not a failure) - */ - if (page1.start != page2.start || page2.start != page3.start) { - printf("VMAs are not merged to start, aborting test\n"); - ret = 0; - goto out; - } - - if (munlock(map + page_size, page_size)) { - perror("munlock()"); - goto out; - } - - if (get_vm_area((unsigned long)map, &page1) || - get_vm_area((unsigned long)map + page_size, &page2) || - get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; - } - - /* All three VMAs should be different */ - if (page1.start == page2.start || page2.start == page3.start) { - printf("failed to split VMA for munlock\n"); - goto out; - } - - /* Now unlock the first and third page and check the VMAs again */ - if (munlock(map, page_size * 3)) { - perror("munlock()"); - goto out; - } - - if (get_vm_area((unsigned long)map, &page1) || - get_vm_area((unsigned long)map + page_size, &page2) || - get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; - } - - /* Now all three VMAs should be the same */ - if (page1.start != page2.start || page2.start != page3.start) { - printf("failed to merge VMAs after munlock\n"); - goto out; - } - - ret = 0; -out: - munmap(map, 3 * page_size); - return ret; -} - -static int test_mlockall(int (test_function)(bool call_mlock)) -{ - int ret = 1; - - if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { - perror("mlockall"); - return ret; - } - - ret = test_function(false); - munlockall(); - return ret; -} - -int main(int argc, char **argv) -{ - int ret = 0; - ret += test_mlock_lock(); - ret += test_mlock_onfault(); - ret += test_munlockall(); - ret += test_lock_onfault_of_present(); - ret += test_vma_management(true); - ret += test_mlockall(test_vma_management); - return ret; -} diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/vm/mlock2.h deleted file mode 100644 index 2a6e76c226bc..000000000000 --- a/tools/testing/selftests/vm/mlock2.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include -#include - -#ifndef MLOCK_ONFAULT -#define MLOCK_ONFAULT 1 -#endif - -#ifndef MCL_ONFAULT -#define MCL_ONFAULT (MCL_FUTURE << 1) -#endif - -static int mlock2_(void *start, size_t len, int flags) -{ -#ifdef __NR_mlock2 - return syscall(__NR_mlock2, start, len, flags); -#else - errno = ENOSYS; - return -1; -#endif -} - -static FILE *seek_to_smaps_entry(unsigned long addr) -{ - FILE *file; - char *line = NULL; - size_t size = 0; - unsigned long start, end; - char perms[5]; - unsigned long offset; - char dev[32]; - unsigned long inode; - char path[BUFSIZ]; - - file = fopen("/proc/self/smaps", "r"); - if (!file) { - perror("fopen smaps"); - _exit(1); - } - - while (getline(&line, &size, file) > 0) { - if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", - &start, &end, perms, &offset, dev, &inode, path) < 6) - goto next; - - if (start <= addr && addr < end) - goto out; - -next: - free(line); - line = NULL; - size = 0; - } - - fclose(file); - file = NULL; - -out: - free(line); - return file; -} diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c deleted file mode 100644 index 6c62966ab5db..000000000000 --- a/tools/testing/selftests/vm/mrelease_test.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2022 Google LLC - */ -#define _GNU_SOURCE -#include -#include -#include 
-#include -#include -#include - -#include "util.h" - -#include "../kselftest.h" - -#ifndef __NR_pidfd_open -#define __NR_pidfd_open -1 -#endif - -#ifndef __NR_process_mrelease -#define __NR_process_mrelease -1 -#endif - -#define MB(x) (x << 20) -#define MAX_SIZE_MB 1024 - -static int alloc_noexit(unsigned long nr_pages, int pipefd) -{ - int ppid = getppid(); - int timeout = 10; /* 10sec timeout to get killed */ - unsigned long i; - char *buf; - - buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, 0, 0); - if (buf == MAP_FAILED) { - perror("mmap failed, halting the test"); - return KSFT_FAIL; - } - - for (i = 0; i < nr_pages; i++) - *((unsigned long *)(buf + (i * PAGE_SIZE))) = i; - - /* Signal the parent that the child is ready */ - if (write(pipefd, "", 1) < 0) { - perror("write"); - return KSFT_FAIL; - } - - /* Wait to be killed (when reparenting happens) */ - while (getppid() == ppid && timeout > 0) { - sleep(1); - timeout--; - } - - munmap(buf, nr_pages * PAGE_SIZE); - - return (timeout > 0) ? KSFT_PASS : KSFT_FAIL; -} - -/* The process_mrelease calls in this test are expected to fail */ -static void run_negative_tests(int pidfd) -{ - int res; - /* Test invalid flags. Expect to fail with EINVAL error code. */ - if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || - errno != EINVAL) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease with wrong flags"); - exit(res); - } - /* - * Test reaping while process is alive with no pending SIGKILL. - * Expect to fail with EINVAL error code. - */ - if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease on a live process"); - exit(res); - } -} - -static int child_main(int pipefd[], size_t size) -{ - int res; - - /* Allocate and fault-in memory and wait to be killed */ - close(pipefd[0]); - res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]); - close(pipefd[1]); - return res; -} - -int main(void) -{ - int pipefd[2], pidfd; - bool success, retry; - size_t size; - pid_t pid; - char byte; - int res; - - /* Test a wrong pidfd */ - if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease with wrong pidfd"); - exit(res); - } - - /* Start the test with 1MB child memory allocation */ - size = 1; -retry: - /* - * Pipe for the child to signal when it's done allocating - * memory - */ - if (pipe(pipefd)) { - perror("pipe"); - exit(KSFT_FAIL); - } - pid = fork(); - if (pid < 0) { - perror("fork"); - close(pipefd[0]); - close(pipefd[1]); - exit(KSFT_FAIL); - } - - if (pid == 0) { - /* Child main routine */ - res = child_main(pipefd, size); - exit(res); - } - - /* - * Parent main routine: - * Wait for the child to finish allocations, then kill and reap - */ - close(pipefd[1]); - /* Block until the child is ready */ - res = read(pipefd[0], &byte, 1); - close(pipefd[0]); - if (res < 0) { - perror("read"); - if (!kill(pid, SIGKILL)) - waitpid(pid, NULL, 0); - exit(KSFT_FAIL); - } - - pidfd = syscall(__NR_pidfd_open, pid, 0); - if (pidfd < 0) { - perror("pidfd_open"); - if (!kill(pid, SIGKILL)) - waitpid(pid, NULL, 0); - exit(KSFT_FAIL); - } - - /* Run negative tests which require a live child */ - run_negative_tests(pidfd); - - if (kill(pid, SIGKILL)) { - res = (errno == ENOSYS ? 
KSFT_SKIP : KSFT_FAIL);
-		perror("kill");
-		exit(res);
-	}
-
-	success = (syscall(__NR_process_mrelease, pidfd, 0) == 0);
-	if (!success) {
-		/*
-		 * We may have failed to reap because the child exited too
-		 * soon, before we could call process_mrelease. In that case,
-		 * double the child's memory, which makes it spend more time
-		 * on cleanup and increases our chances of reaping its memory
-		 * before it exits.
-		 * Retry until we succeed or reach MAX_SIZE_MB.
-		 */
-		if (errno == ESRCH) {
-			retry = (size <= MAX_SIZE_MB);
-		} else {
-			res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-			perror("process_mrelease");
-			waitpid(pid, NULL, 0);
-			exit(res);
-		}
-	}
-
-	/* Cleanup to prevent zombies */
-	if (waitpid(pid, NULL, 0) < 0) {
-		perror("waitpid");
-		exit(KSFT_FAIL);
-	}
-	close(pidfd);
-
-	if (!success) {
-		if (retry) {
-			size *= 2;
-			goto retry;
-		}
-		printf("All process_mrelease attempts failed!\n");
-		exit(KSFT_FAIL);
-	}
-
-	printf("Success reaping a child with %zuMB of memory allocations\n",
-	       size);
-	return KSFT_PASS;
-}
diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c
deleted file mode 100644
index f01dc4a85b0b..000000000000
--- a/tools/testing/selftests/vm/mremap_dontunmap.c
+++ /dev/null
@@ -1,364 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Tests for mremap w/ MREMAP_DONTUNMAP.
- *
- * Copyright 2020, Brian Geffon
- */
-#define _GNU_SOURCE
-#include <sys/mman.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "../kselftest.h"
-
-#ifndef MREMAP_DONTUNMAP
-#define MREMAP_DONTUNMAP 4
-#endif
-
-unsigned long page_size;
-char *page_buffer;
-
-static void dump_maps(void)
-{
-	char cmd[32];
-
-	snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
-	system(cmd);
-}
-
-#define BUG_ON(condition, description)					      \
-	do {								      \
-		if (condition) {					      \
-			fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \
-				__LINE__, (description), strerror(errno));    \
-			dump_maps();					      \
-			exit(1);					      \
-		}							      \
-	} while (0)
-
-// Try a simple operation to test for kernel support; this prevents
-// reporting tests as failed when run on an older kernel.
-static int kernel_support_for_mremap_dontunmap()
-{
-	int ret = 0;
-	unsigned long num_pages = 1;
-	void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE,
-				    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	BUG_ON(source_mapping == MAP_FAILED, "mmap");
-
-	// This simple remap should only fail if MREMAP_DONTUNMAP isn't
-	// supported.
-	void *dest_mapping =
-	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
-		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0);
-	if (dest_mapping == MAP_FAILED) {
-		ret = errno;
-	} else {
-		BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
-		       "unable to unmap destination mapping");
-	}
-
-	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-	       "unable to unmap source mapping");
-	return ret;
-}
-
-// This helper will just validate that an entire mapping contains the expected
-// byte.
-static int check_region_contains_byte(void *addr, unsigned long size, char byte)
-{
-	BUG_ON(size & (page_size - 1),
-	       "check_region_contains_byte expects page multiples");
-	BUG_ON((unsigned long)addr & (page_size - 1),
-	       "check_region_contains_byte expects page alignment");
-
-	memset(page_buffer, byte, page_size);
-
-	unsigned long num_pages = size / page_size;
-	unsigned long i;
-
-	// Compare each page checking that it contains our expected byte.
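-	// (page_buffer was just filled with the expected byte above, so each
-	// iteration below reduces to a single page-sized memcmp().)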
- for (i = 0; i < num_pages; ++i) { - int ret = - memcmp(addr + (i * page_size), page_buffer, page_size); - if (ret) { - return ret; - } - } - - return 0; -} - -// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving -// the source mapping mapped. -static void mremap_dontunmap_simple() -{ - unsigned long num_pages = 5; - - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - - memset(source_mapping, 'a', num_pages * page_size); - - // Try to just move the whole mapping anywhere (not fixed). - void *dest_mapping = - mremap(source_mapping, num_pages * page_size, num_pages * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - - // Validate that the pages have been moved, we know they were moved if - // the dest_mapping contains a's. - BUG_ON(check_region_contains_byte - (dest_mapping, num_pages * page_size, 'a') != 0, - "pages did not migrate"); - BUG_ON(check_region_contains_byte - (source_mapping, num_pages * page_size, 0) != 0, - "source should have no ptes"); - - BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected. -static void mremap_dontunmap_simple_shmem() -{ - unsigned long num_pages = 5; - - int mem_fd = memfd_create("memfd", MFD_CLOEXEC); - BUG_ON(mem_fd < 0, "memfd_create"); - - BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0, - "ftruncate"); - - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_FILE | MAP_SHARED, mem_fd, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - - BUG_ON(close(mem_fd) < 0, "close"); - - memset(source_mapping, 'a', num_pages * page_size); - - // Try to just move the whole mapping anywhere (not fixed). - void *dest_mapping = - mremap(source_mapping, num_pages * page_size, num_pages * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); - if (dest_mapping == MAP_FAILED && errno == EINVAL) { - // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem. - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); - return; - } - - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - - // Validate that the pages have been moved, we know they were moved if - // the dest_mapping contains a's. - BUG_ON(check_region_contains_byte - (dest_mapping, num_pages * page_size, 'a') != 0, - "pages did not migrate"); - - // Because the region is backed by shmem, we will actually see the same - // memory at the source location still. - BUG_ON(check_region_contains_byte - (source_mapping, num_pages * page_size, 'a') != 0, - "source should have no ptes"); - - BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates MREMAP_DONTUNMAP will move page tables to a specific -// destination using MREMAP_FIXED, also while validating that the source -// remains intact. -static void mremap_dontunmap_simple_fixed() -{ - unsigned long num_pages = 5; - - // Since we want to guarantee that we can remap to a point, we will - // create a mapping up front. 
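(Aside: per mremap(2), MREMAP_FIXED unmaps any existing mapping in the destination range before installing the moved pages, which is why the Xs written below are expected to vanish rather than cause a failure.)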
- void *dest_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(dest_mapping == MAP_FAILED, "mmap"); - memset(dest_mapping, 'X', num_pages * page_size); - - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - memset(source_mapping, 'a', num_pages * page_size); - - void *remapped_mapping = - mremap(source_mapping, num_pages * page_size, num_pages * page_size, - MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE, - dest_mapping); - BUG_ON(remapped_mapping == MAP_FAILED, "mremap"); - BUG_ON(remapped_mapping != dest_mapping, - "mremap should have placed the remapped mapping at dest_mapping"); - - // The dest mapping will have been unmap by mremap so we expect the Xs - // to be gone and replaced with a's. - BUG_ON(check_region_contains_byte - (dest_mapping, num_pages * page_size, 'a') != 0, - "pages did not migrate"); - - // And the source mapping will have had its ptes dropped. - BUG_ON(check_region_contains_byte - (source_mapping, num_pages * page_size, 0) != 0, - "source should have no ptes"); - - BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates that we can MREMAP_DONTUNMAP for a portion of an -// existing mapping. -static void mremap_dontunmap_partial_mapping() -{ - /* - * source mapping: - * -------------- - * | aaaaaaaaaa | - * -------------- - * to become: - * -------------- - * | aaaaa00000 | - * -------------- - * With the destination mapping containing 5 pages of As. - * --------- - * | aaaaa | - * --------- - */ - unsigned long num_pages = 10; - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - memset(source_mapping, 'a', num_pages * page_size); - - // We will grab the last 5 pages of the source and move them. - void *dest_mapping = - mremap(source_mapping + (5 * page_size), 5 * page_size, - 5 * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - - // We expect the first 5 pages of the source to contain a's and the - // final 5 pages to contain zeros. - BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') != - 0, "first 5 pages of source should have original pages"); - BUG_ON(check_region_contains_byte - (source_mapping + (5 * page_size), 5 * page_size, 0) != 0, - "final 5 pages of source should have no ptes"); - - // Finally we expect the destination to have 5 pages worth of a's. - BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != - 0, "dest mapping should contain ptes from the source"); - - BUG_ON(munmap(dest_mapping, 5 * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates that we can remap over only a portion of a mapping. -static void mremap_dontunmap_partial_mapping_overwrite(void) -{ - /* - * source mapping: - * --------- - * |aaaaa| - * --------- - * dest mapping initially: - * ----------- - * |XXXXXXXXXX| - * ------------ - * Source to become: - * --------- - * |00000| - * --------- - * With the destination mapping containing 5 pages of As. 
- * ------------ - * |aaaaaXXXXX| - * ------------ - */ - void *source_mapping = - mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - memset(source_mapping, 'a', 5 * page_size); - - void *dest_mapping = - mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(dest_mapping == MAP_FAILED, "mmap"); - memset(dest_mapping, 'X', 10 * page_size); - - // We will grab the last 5 pages of the source and move them. - void *remapped_mapping = - mremap(source_mapping, 5 * page_size, - 5 * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping); - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping"); - - BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) != - 0, "first 5 pages of source should have no ptes"); - - // Finally we expect the destination to have 5 pages worth of a's. - BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0, - "dest mapping should contain ptes from the source"); - - // Finally the last 5 pages shouldn't have been touched. - BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size), - 5 * page_size, 'X') != 0, - "dest mapping should have retained the last 5 pages"); - - BUG_ON(munmap(dest_mapping, 10 * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, 5 * page_size) == -1, - "unable to unmap source mapping"); -} - -int main(void) -{ - page_size = sysconf(_SC_PAGE_SIZE); - - // test for kernel support for MREMAP_DONTUNMAP skipping the test if - // not. - if (kernel_support_for_mremap_dontunmap() != 0) { - printf("No kernel support for MREMAP_DONTUNMAP\n"); - return KSFT_SKIP; - } - - // Keep a page sized buffer around for when we need it. - page_buffer = - mmap(NULL, page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page."); - - mremap_dontunmap_simple(); - mremap_dontunmap_simple_shmem(); - mremap_dontunmap_simple_fixed(); - mremap_dontunmap_partial_mapping(); - mremap_dontunmap_partial_mapping_overwrite(); - - BUG_ON(munmap(page_buffer, page_size) == -1, - "unable to unmap page buffer"); - - printf("OK\n"); - return 0; -} diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c deleted file mode 100644 index 9496346973d4..000000000000 --- a/tools/testing/selftests/vm/mremap_test.c +++ /dev/null @@ -1,475 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2020 Google LLC - */ -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" - -#define EXPECT_SUCCESS 0 -#define EXPECT_FAILURE 1 -#define NON_OVERLAPPING 0 -#define OVERLAPPING 1 -#define NS_PER_SEC 1000000000ULL -#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */ -#define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ - -#define MIN(X, Y) ((X) < (Y) ? 
(X) : (Y)) - -struct config { - unsigned long long src_alignment; - unsigned long long dest_alignment; - unsigned long long region_size; - int overlapping; -}; - -struct test { - const char *name; - struct config config; - int expect_failure; -}; - -enum { - _1KB = 1ULL << 10, /* 1KB -> not page aligned */ - _4KB = 4ULL << 10, - _8KB = 8ULL << 10, - _1MB = 1ULL << 20, - _2MB = 2ULL << 20, - _4MB = 4ULL << 20, - _1GB = 1ULL << 30, - _2GB = 2ULL << 30, - PMD = _2MB, - PUD = _1GB, -}; - -#define PTE page_size - -#define MAKE_TEST(source_align, destination_align, size, \ - overlaps, should_fail, test_name) \ -(struct test){ \ - .name = test_name, \ - .config = { \ - .src_alignment = source_align, \ - .dest_alignment = destination_align, \ - .region_size = size, \ - .overlapping = overlaps, \ - }, \ - .expect_failure = should_fail \ -} - -/* - * Returns false if the requested remap region overlaps with an - * existing mapping (e.g text, stack) else returns true. - */ -static bool is_remap_region_valid(void *addr, unsigned long long size) -{ - void *remap_addr = NULL; - bool ret = true; - - /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */ - remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE, - MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, - -1, 0); - - if (remap_addr == MAP_FAILED) { - if (errno == EEXIST) - ret = false; - } else { - munmap(remap_addr, size); - } - - return ret; -} - -/* Returns mmap_min_addr sysctl tunable from procfs */ -static unsigned long long get_mmap_min_addr(void) -{ - FILE *fp; - int n_matched; - static unsigned long long addr; - - if (addr) - return addr; - - fp = fopen("/proc/sys/vm/mmap_min_addr", "r"); - if (fp == NULL) { - ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n", - strerror(errno)); - exit(KSFT_SKIP); - } - - n_matched = fscanf(fp, "%llu", &addr); - if (n_matched != 1) { - ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n", - strerror(errno)); - fclose(fp); - exit(KSFT_SKIP); - } - - fclose(fp); - return addr; -} - -/* - * This test validates that merge is called when expanding a mapping. - * Mapping containing three pages is created, middle page is unmapped - * and then the mapping containing the first page is expanded so that - * it fills the created hole. The two parts should merge creating - * single mapping with three pages. - */ -static void mremap_expand_merge(unsigned long page_size) -{ - char *test_name = "mremap expand merge"; - FILE *fp; - char *line = NULL; - size_t len = 0; - bool success = false; - char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - munmap(start + page_size, page_size); - mremap(start, page_size, 2 * page_size, 0); - - fp = fopen("/proc/self/maps", "r"); - if (fp == NULL) { - ksft_test_result_fail("%s\n", test_name); - return; - } - - while (getline(&line, &len, fp) != -1) { - char *first = strtok(line, "- "); - void *first_val = (void *)strtol(first, NULL, 16); - char *second = strtok(NULL, "- "); - void *second_val = (void *) strtol(second, NULL, 16); - - if (first_val == start && second_val == start + 3 * page_size) { - success = true; - break; - } - } - if (success) - ksft_test_result_pass("%s\n", test_name); - else - ksft_test_result_fail("%s\n", test_name); - fclose(fp); -} - -/* - * Returns the start address of the mapping on success, else returns - * NULL on failure. 
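(Note on the address-picking logic that follows: an address only counts as matching the requested alignment if it is not also aligned to the next power of two. For a requested 2 MB alignment: 0x40200000 & 0x1fffff == 0 and 0x40200000 & 0x200000 != 0, so it is accepted; 0x40000000 & 0x200000 == 0, so it is retried, since it is also 4 MB, 8 MB, ... aligned and would blur which alignment is actually being measured.)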
- */
-static void *get_source_mapping(struct config c)
-{
- unsigned long long addr = 0ULL;
- void *src_addr = NULL;
- unsigned long long mmap_min_addr;
-
- mmap_min_addr = get_mmap_min_addr();
-
-retry:
- addr += c.src_alignment;
- if (addr < mmap_min_addr)
- goto retry;
-
- src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
- MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
- -1, 0);
- if (src_addr == MAP_FAILED) {
- if (errno == EPERM || errno == EEXIST)
- goto retry;
- goto error;
- }
- /*
- * Check that the address is aligned to the specified alignment.
- * Addresses whose alignment is a multiple of the one specified are
- * not considered valid. For instance, a 1GB-aligned address is also
- * 2MB-aligned, but it will not be considered valid for a requested
- * alignment of 2MB. This is done to reduce coincidental alignment
- * in the tests.
- */
- if (((unsigned long long) src_addr & (c.src_alignment - 1)) ||
- !((unsigned long long) src_addr & c.src_alignment)) {
- munmap(src_addr, c.region_size);
- goto retry;
- }
-
- if (!src_addr)
- goto error;
-
- return src_addr;
-error:
- ksft_print_msg("Failed to map source region: %s\n",
- strerror(errno));
- return NULL;
-}
-
-/* Returns the time taken for the remap on success, else returns -1. */
-static long long remap_region(struct config c, unsigned int threshold_mb,
- char pattern_seed)
-{
- void *addr, *src_addr, *dest_addr;
- unsigned long long i;
- struct timespec t_start = {0, 0}, t_end = {0, 0};
- long long start_ns, end_ns, align_mask, ret, offset;
- unsigned long long threshold;
-
- if (threshold_mb == VALIDATION_NO_THRESHOLD)
- threshold = c.region_size;
- else
- threshold = MIN(threshold_mb * _1MB, c.region_size);
-
- src_addr = get_source_mapping(c);
- if (!src_addr) {
- ret = -1;
- goto out;
- }
-
- /* Set byte pattern */
- srand(pattern_seed);
- for (i = 0; i < threshold; i++)
- memset((char *) src_addr + i, (char) rand(), 1);
-
- /* Mask to zero out lower bits of address for alignment */
- align_mask = ~(c.dest_alignment - 1);
- /* Offset of destination address from the end of the source region */
- offset = (c.overlapping) ? 
-c.dest_alignment : c.dest_alignment;
- addr = (void *) (((unsigned long long) src_addr + c.region_size
- + offset) & align_mask);
-
- /* See comment in get_source_mapping() */
- if (!((unsigned long long) addr & c.dest_alignment))
- addr = (void *) ((unsigned long long) addr | c.dest_alignment);
-
- /* Don't destroy existing mappings unless expected to overlap */
- while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) {
- /* Check for unsigned overflow */
- if (addr + c.dest_alignment < addr) {
- ksft_print_msg("Couldn't find a valid region to remap to\n");
- ret = -1;
- goto out;
- }
- addr += c.dest_alignment;
- }
-
- clock_gettime(CLOCK_MONOTONIC, &t_start);
- dest_addr = mremap(src_addr, c.region_size, c.region_size,
- MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr);
- clock_gettime(CLOCK_MONOTONIC, &t_end);
-
- if (dest_addr == MAP_FAILED) {
- ksft_print_msg("mremap failed: %s\n", strerror(errno));
- ret = -1;
- goto clean_up_src;
- }
-
- /* Verify byte pattern after remapping */
- srand(pattern_seed);
- for (i = 0; i < threshold; i++) {
- char expected = (char) rand();
-
- if (((char *) dest_addr)[i] != expected) {
- ksft_print_msg("Data after remap doesn't match at offset %llu\n",
- i);
- ksft_print_msg("Expected: %#x\t Got: %#x\n", expected & 0xff,
- ((char *) dest_addr)[i] & 0xff);
- ret = -1;
- goto clean_up_dest;
- }
- }
-
- start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec;
- end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec;
- ret = end_ns - start_ns;
-
-/*
- * Since the destination address is specified using MREMAP_FIXED, subsequent
- * mremap will unmap any previous mapping at the address range specified by
- * dest_addr and region_size. This significantly affects the remap time of
- * subsequent tests. So we clean up mappings after each test.
- */
-clean_up_dest:
- munmap(dest_addr, c.region_size);
-clean_up_src:
- munmap(src_addr, c.region_size);
-out:
- return ret;
-}
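-
-/*
- * Illustrative sketch (not part of the original file): the timing done
- * by remap_region() above boils down to bracketing the mremap() with
- * CLOCK_MONOTONIC samples and converting the delta to nanoseconds:
- *
- *	clock_gettime(CLOCK_MONOTONIC, &t_start);
- *	dest = mremap(src, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, addr);
- *	clock_gettime(CLOCK_MONOTONIC, &t_end);
- *	elapsed_ns = (t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec) -
- *		     (t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec);
- */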
-
-static void run_mremap_test_case(struct test test_case, int *failures,
- unsigned int threshold_mb,
- unsigned int pattern_seed)
-{
- long long remap_time = remap_region(test_case.config, threshold_mb,
- pattern_seed);
-
- if (remap_time < 0) {
- if (test_case.expect_failure)
- ksft_test_result_xfail("%s\n\tExpected mremap failure\n",
- test_case.name);
- else {
- ksft_test_result_fail("%s\n", test_case.name);
- *failures += 1;
- }
- } else {
- /*
- * Comparing mremap time is only applicable if entire region
- * was faulted in.
- */
- if (threshold_mb == VALIDATION_NO_THRESHOLD ||
- test_case.config.region_size <= threshold_mb * _1MB)
- ksft_test_result_pass("%s\n\tmremap time: %12lldns\n",
- test_case.name, remap_time);
- else
- ksft_test_result_pass("%s\n", test_case.name);
- }
-}
-
-static void usage(const char *cmd)
-{
- fprintf(stderr,
- "Usage: %s [[-t <threshold_mb>] [-p <pattern_seed>]]\n"
- "-t\t only validate threshold_mb of the remapped region\n"
- " \t if 0 is supplied no threshold is used; all tests\n"
- " \t are run and remapped regions validated fully.\n"
- " \t The default threshold used is 4MB.\n"
- "-p\t provide a seed to generate the random pattern for\n"
- " \t validating the remapped region.\n", cmd);
-}
-
-static int parse_args(int argc, char **argv, unsigned int *threshold_mb,
- unsigned int *pattern_seed)
-{
- const char *optstr = "t:p:";
- int opt;
-
- while ((opt = getopt(argc, argv, optstr)) != -1) {
- switch (opt) {
- case 't':
- *threshold_mb = atoi(optarg);
- break;
- case 'p':
- *pattern_seed = atoi(optarg);
- break;
- default:
- usage(argv[0]);
- return -1;
- }
- }
-
- if (optind < argc) {
- usage(argv[0]);
- return -1;
- }
-
- return 0;
-}
-
-#define MAX_TEST 13
-#define MAX_PERF_TEST 3
-int main(int argc, char **argv)
-{
- int failures = 0;
- int i, run_perf_tests;
- unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
- unsigned int pattern_seed;
- int num_expand_tests = 1;
- struct test test_cases[MAX_TEST];
- struct test perf_test_cases[MAX_PERF_TEST];
- int page_size;
- time_t t;
-
- pattern_seed = (unsigned int) time(&t);
-
- if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0)
- exit(EXIT_FAILURE);
-
- ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n",
- threshold_mb, pattern_seed);
-
- page_size = sysconf(_SC_PAGESIZE);
-
- /* Expected mremap failures */
- test_cases[0] = MAKE_TEST(page_size, page_size, page_size,
- OVERLAPPING, EXPECT_FAILURE,
- "mremap - Source and Destination Regions Overlapping");
-
- test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size,
- NON_OVERLAPPING, EXPECT_FAILURE,
- "mremap - Destination Address Misaligned (1KB-aligned)");
- test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size,
- NON_OVERLAPPING, EXPECT_FAILURE,
- "mremap - Source Address Misaligned (1KB-aligned)");
-
- /* Src addr PTE aligned */
- test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2,
- NON_OVERLAPPING, EXPECT_SUCCESS,
- "8KB mremap - Source PTE-aligned, Destination PTE-aligned");
-
- /* Src addr 1MB aligned */
- test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2MB mremap - Source 1MB-aligned, Destination PTE-aligned");
- test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned");
-
- /* Src addr PMD aligned */
- test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "4MB mremap - Source PMD-aligned, Destination PTE-aligned");
- test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "4MB mremap - Source PMD-aligned, Destination 1MB-aligned");
- test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "4MB mremap - Source PMD-aligned, Destination PMD-aligned");
-
- /* Src addr PUD aligned */
- test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2GB mremap - Source PUD-aligned, Destination PTE-aligned");
- test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2GB mremap - Source PUD-aligned, Destination 1MB-aligned");
- test_cases[11] = 
MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination PMD-aligned"); - test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination PUD-aligned"); - - perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PTE-aligned, Destination PTE-aligned"); - /* - * mremap 1GB region - Page table level aligned time - * comparison. - */ - perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PMD-aligned, Destination PMD-aligned"); - perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PUD-aligned, Destination PUD-aligned"); - - run_perf_tests = (threshold_mb == VALIDATION_NO_THRESHOLD) || - (threshold_mb * _1MB >= _1GB); - - ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? - ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); - - for (i = 0; i < ARRAY_SIZE(test_cases); i++) - run_mremap_test_case(test_cases[i], &failures, threshold_mb, - pattern_seed); - - mremap_expand_merge(page_size); - - if (run_perf_tests) { - ksft_print_msg("\n%s\n", - "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); - for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) - run_mremap_test_case(perf_test_cases[i], &failures, - threshold_mb, pattern_seed); - } - - if (failures > 0) - ksft_exit_fail(); - else - ksft_exit_pass(); -} diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c deleted file mode 100644 index 634d87dfb2a4..000000000000 --- a/tools/testing/selftests/vm/on-fault-limit.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include - -#ifndef MCL_ONFAULT -#define MCL_ONFAULT (MCL_FUTURE << 1) -#endif - -static int test_limit(void) -{ - int ret = 1; - struct rlimit lims; - void *map; - - if (getrlimit(RLIMIT_MEMLOCK, &lims)) { - perror("getrlimit"); - return ret; - } - - if (mlockall(MCL_ONFAULT | MCL_FUTURE)) { - perror("mlockall"); - return ret; - } - - map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); - if (map != MAP_FAILED) - printf("mmap should have failed, but didn't\n"); - else { - ret = 0; - munmap(map, 2 * lims.rlim_max); - } - - munlockall(); - return ret; -} - -int main(int argc, char **argv) -{ - int ret = 0; - - ret += test_limit(); - return ret; -} diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h deleted file mode 100644 index 92f3be3dd8e5..000000000000 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ /dev/null @@ -1,226 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PKEYS_HELPER_H -#define _PKEYS_HELPER_H -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" - -/* Define some kernel-like types */ -#define u8 __u8 -#define u16 __u16 -#define u32 __u32 -#define u64 __u64 - -#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) - -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 0 -#endif -#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 -extern int dprint_in_signal; -extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; - -extern int test_nr; -extern int iteration_nr; - -#ifdef __GNUC__ -__attribute__((format(printf, 1, 2))) -#endif -static inline void sigsafe_printf(const char 
*format, ...)
-{
- va_list ap;
-
- if (!dprint_in_signal) {
- va_start(ap, format);
- vprintf(format, ap);
- va_end(ap);
- } else {
- int ret;
- /*
- * No printf() functions are signal-safe.
- * They deadlock easily. Write the format
- * string to get some output, even if
- * incomplete.
- */
- ret = write(1, format, strlen(format));
- if (ret < 0)
- exit(1);
- }
-}
-#define dprintf_level(level, args...) do { \
- if (level <= DEBUG_LEVEL) \
- sigsafe_printf(args); \
-} while (0)
-#define dprintf0(args...) dprintf_level(0, args)
-#define dprintf1(args...) dprintf_level(1, args)
-#define dprintf2(args...) dprintf_level(2, args)
-#define dprintf3(args...) dprintf_level(3, args)
-#define dprintf4(args...) dprintf_level(4, args)
-
-extern void abort_hooks(void);
-#define pkey_assert(condition) do { \
- if (!(condition)) { \
- dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
- __FILE__, __LINE__, \
- test_nr, iteration_nr); \
- dprintf0("errno at assert: %d\n", errno); \
- abort_hooks(); \
- exit(__LINE__); \
- } \
-} while (0)
-
-__attribute__((noinline)) int read_ptr(int *ptr);
-void expected_pkey_fault(int pkey);
-int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
-int sys_pkey_free(unsigned long pkey);
-int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
- unsigned long pkey);
-void record_pkey_malloc(void *ptr, long size, int prot);
-
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-#include "pkey-x86.h"
-#elif defined(__powerpc64__) /* arch */
-#include "pkey-powerpc.h"
-#else /* arch */
-#error Architecture not supported
-#endif /* arch */
-
-#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
-
-static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
-{
- u32 shift = pkey_bit_position(pkey);
- /* mask out bits from pkey in old value */
- reg &= ~((u64)PKEY_MASK << shift);
- /* OR in new bits for pkey */
- reg |= (flags & PKEY_MASK) << shift;
- return reg;
-}
-
-static inline u64 get_pkey_bits(u64 reg, int pkey)
-{
- u32 shift = pkey_bit_position(pkey);
- /*
- * shift down the relevant bits to the lowest two, then
- * mask off all the other higher bits
- */
- return ((reg >> shift) & PKEY_MASK);
-}
-
-extern u64 shadow_pkey_reg;
-
-static inline u64 _read_pkey_reg(int line)
-{
- u64 pkey_reg = __read_pkey_reg();
-
- dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx"
- " shadow: %016llx\n",
- line, pkey_reg, shadow_pkey_reg);
- assert(pkey_reg == shadow_pkey_reg);
-
- return pkey_reg;
-}
-
-#define read_pkey_reg() _read_pkey_reg(__LINE__)
-
-static inline void write_pkey_reg(u64 pkey_reg)
-{
- dprintf4("%s() changing %016llx to %016llx\n", __func__,
- __read_pkey_reg(), pkey_reg);
- /* will do the shadow check for us: */
- read_pkey_reg();
- __write_pkey_reg(pkey_reg);
- shadow_pkey_reg = pkey_reg;
- dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__,
- pkey_reg, __read_pkey_reg());
-}
-
-/*
- * These are technically racy, since something could
- * change the PKEY register between the read and the write. 
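- *
- * Sketch of the window being described (illustrative, not from the
- * original file):
- *
- *	u64 reg = read_pkey_reg();	// another thread could change
- *	reg |= bits;			// the register right here, and
- *	write_pkey_reg(reg);		// this store would lose its update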
- */
-static inline void __pkey_access_allow(int pkey, int do_allow)
-{
- u64 pkey_reg = read_pkey_reg();
- int bit = pkey * 2;
-
- if (do_allow)
- pkey_reg &= (1 << bit);
- else
- pkey_reg |= (1 << bit);
-
- dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
- write_pkey_reg(pkey_reg);
-}
-
-static inline void __pkey_write_allow(int pkey, int do_allow_write)
-{
- u64 pkey_reg = read_pkey_reg();
- long bit = pkey * 2 + 1;
-
- if (do_allow_write)
- pkey_reg &= (1 << bit);
- else
- pkey_reg |= (1 << bit);
-
- write_pkey_reg(pkey_reg);
- dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
-}
-
-#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
-#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
-#define ALIGN_PTR_UP(p, ptr_align_to) \
- ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
-#define ALIGN_PTR_DOWN(p, ptr_align_to) \
- ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
-#define __stringify_1(x...) #x
-#define __stringify(x...) __stringify_1(x)
-
-static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
-{
-#ifdef si_pkey
- return &si->si_pkey;
-#else
- return (u32 *)(((u8 *)si) + si_pkey_offset);
-#endif
-}
-
-static inline int kernel_has_pkeys(void)
-{
- /* try allocating a key and see if it succeeds */
- int ret = sys_pkey_alloc(0, 0);
- if (ret <= 0) {
- return 0;
- }
- sys_pkey_free(ret);
- return 1;
-}
-
-static inline int is_pkeys_supported(void)
-{
- /* check if the cpu supports pkeys */
- if (!cpu_has_pkeys()) {
- dprintf1("SKIP: %s: no CPU support\n", __func__);
- return 0;
- }
-
- /* check if the kernel supports pkeys */
- if (!kernel_has_pkeys()) {
- dprintf1("SKIP: %s: no kernel support\n", __func__);
- return 0;
- }
-
- return 1;
-}
-
-#endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h
deleted file mode 100644
index 1ebb586b2fbc..000000000000
--- a/tools/testing/selftests/vm/pkey-powerpc.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _PKEYS_POWERPC_H
-#define _PKEYS_POWERPC_H
-
-#ifndef SYS_mprotect_key
-# define SYS_mprotect_key 386
-#endif
-#ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc 384
-# define SYS_pkey_free 385
-#endif
-#define REG_IP_IDX PT_NIP
-#define REG_TRAPNO PT_TRAP
-#define gregs gp_regs
-#define fpregs fp_regs
-#define si_pkey_offset 0x20
-
-#undef PKEY_DISABLE_ACCESS
-#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */
-
-#undef PKEY_DISABLE_WRITE
-#define PKEY_DISABLE_WRITE 0x2
-
-#define NR_PKEYS 32
-#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey
- and 24 other keys that cannot be
- represented in the PTE */
-#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0,
- pkey-1 and exec-only key */
-#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1,
- pkey-31 and exec-only key */
-#define PKEY_BITS_PER_PKEY 2
-#define HPAGE_SIZE (1UL << 24)
-#define PAGE_SIZE sysconf(_SC_PAGESIZE)
-
-static inline u32 pkey_bit_position(int pkey)
-{
- return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
-}
-
-static inline u64 __read_pkey_reg(void)
-{
- u64 pkey_reg;
-
- asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg));
-
- return pkey_reg;
-}
-
-static inline void __write_pkey_reg(u64 pkey_reg)
-{
- u64 amr = pkey_reg;
-
- dprintf4("%s() changing %016llx to %016llx\n",
- __func__, __read_pkey_reg(), pkey_reg);
-
- asm volatile("isync; mtspr 0xd, %0; isync"
- : : "r" ((unsigned long)(amr)) : "memory");
-
- dprintf4("%s() pkey register after changing %016llx to %016llx\n",
- __func__, __read_pkey_reg(), pkey_reg);
-}
-
-static inline int cpu_has_pkeys(void)
-{
- /* No simple way to determine this */
- return 1;
-}
-
-static inline bool arch_is_powervm(void)
-{
- struct stat buf;
-
- if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) &&
- (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) &&
- (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) )
- return true;
-
- return false;
-}
-
-static inline int get_arch_reserved_keys(void)
-{
- if (sysconf(_SC_PAGESIZE) == 4096)
- return NR_RESERVED_PKEYS_4K;
- else
- if (arch_is_powervm())
- return NR_RESERVED_PKEYS_64K_4KEYS;
- else
- return NR_RESERVED_PKEYS_64K_3KEYS;
-}
-
-void expect_fault_on_read_execonly_key(void *p1, int pkey)
-{
- /*
- * powerpc does not allow userspace to change permissions of exec-only
- * keys since those keys are not allocated by userspace. The signal
- * handler won't be able to reset the permissions, which means the code
- * will infinitely continue to segfault here.
- */
- return;
-}
-
-/* 4-byte instructions * 16384 = 64K page */
-#define __page_o_noops() asm(".rept 16384 ; nop; .endr")
-
-void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
-{
- void *ptr;
- int ret;
-
- dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
- size, prot, pkey);
- pkey_assert(pkey < NR_PKEYS);
- ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
- pkey_assert(ptr != (void *)-1);
-
- ret = syscall(__NR_subpage_prot, ptr, size, NULL);
- if (ret) {
- perror("subpage_perm");
- return PTR_ERR_ENOTSUP;
- }
-
- ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
- pkey_assert(!ret);
- record_pkey_malloc(ptr, size, prot);
-
- dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
- return ptr;
-}
-
-#endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h
deleted file mode 100644
index 72c14cd3ddc7..000000000000
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _PKEYS_X86_H
-#define _PKEYS_X86_H
-
-#ifdef __i386__
-
-#ifndef SYS_mprotect_key
-# define SYS_mprotect_key 380
-#endif
-
-#ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc 381
-# define SYS_pkey_free 382
-#endif
-
-#define REG_IP_IDX REG_EIP
-#define si_pkey_offset 0x14
-
-#else
-
-#ifndef SYS_mprotect_key
-# define SYS_mprotect_key 329
-#endif
-
-#ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc 330
-# define SYS_pkey_free 331
-#endif
-
-#define REG_IP_IDX REG_RIP
-#define si_pkey_offset 0x20
-
-#endif
-
-#ifndef PKEY_DISABLE_ACCESS
-# define PKEY_DISABLE_ACCESS 0x1
-#endif
-
-#ifndef PKEY_DISABLE_WRITE
-# define PKEY_DISABLE_WRITE 0x2
-#endif
-
-#define NR_PKEYS 16
-#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */
-#define PKEY_BITS_PER_PKEY 2
-#define HPAGE_SIZE (1UL<<21)
-#define PAGE_SIZE 4096
-#define MB (1<<20)
-
-static inline void __page_o_noops(void)
-{
- /* 8 bytes of instruction * 512 = 1 page */
- asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
-}
-
-static inline u64 __read_pkey_reg(void)
-{
- unsigned int eax, edx;
- unsigned int ecx = 0;
- unsigned pkey_reg;
-
- asm volatile(".byte 0x0f,0x01,0xee\n\t"
- : "=a" (eax), "=d" (edx)
- : "c" (ecx));
- pkey_reg = eax;
- return pkey_reg;
-}
-
-static inline void __write_pkey_reg(u64 pkey_reg)
-{
- unsigned int eax = pkey_reg;
- unsigned int ecx = 0;
- unsigned int edx = 0;
-
- dprintf4("%s() changing %016llx to %016llx\n", __func__,
- __read_pkey_reg(), pkey_reg);
- asm volatile(".byte 0x0f,0x01,0xef\n\t"
- : : "a" (eax), "c" (ecx), "d" (edx));
- assert(pkey_reg == __read_pkey_reg());
-}
-
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
-#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */
-#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */
-
-static inline int cpu_has_pkeys(void)
-{
- unsigned int eax;
- unsigned int ebx;
- unsigned int ecx;
- unsigned int edx;
-
- __cpuid_count(0x7, 0x0, eax, ebx, ecx, edx);
-
- if (!(ecx & X86_FEATURE_PKU)) {
- dprintf2("cpu does not have PKU\n");
- return 0;
- }
- if (!(ecx & X86_FEATURE_OSPKE)) {
- dprintf2("cpu does not have OSPKE\n");
- return 0;
- }
- return 1;
-}
-
-static inline int cpu_max_xsave_size(void)
-{
- unsigned long XSTATE_CPUID = 0xd;
- unsigned int eax;
- unsigned int ebx;
- unsigned int ecx;
- unsigned int edx;
-
- 
__cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx); - return ecx; -} - -static inline u32 pkey_bit_position(int pkey) -{ - return pkey * PKEY_BITS_PER_PKEY; -} - -#define XSTATE_PKEY_BIT (9) -#define XSTATE_PKEY 0x200 -#define XSTATE_BV_OFFSET 512 - -int pkey_reg_xstate_offset(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - int xstate_offset; - int xstate_size; - unsigned long XSTATE_CPUID = 0xd; - int leaf; - - /* assume that XSTATE_PKEY is set in XCR0 */ - leaf = XSTATE_PKEY_BIT; - { - __cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx); - - if (leaf == XSTATE_PKEY_BIT) { - xstate_offset = ebx; - xstate_size = eax; - } - } - - if (xstate_size == 0) { - printf("could not find size/offset of PKEY in xsave state\n"); - return 0; - } - - return xstate_offset; -} - -static inline int get_arch_reserved_keys(void) -{ - return NR_RESERVED_PKEYS; -} - -void expect_fault_on_read_execonly_key(void *p1, int pkey) -{ - int ptr_contents; - - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pkey_fault(pkey); -} - -void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) -{ - return PTR_ERR_ENOTSUP; -} - -#endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c deleted file mode 100644 index 95f403a0c46d..000000000000 --- a/tools/testing/selftests/vm/protection_keys.c +++ /dev/null @@ -1,1788 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) - * - * There are examples in here of: - * * how to set protection keys on memory - * * how to set/clear bits in pkey registers (the rights register) - * * how to handle SEGV_PKUERR signals and extract pkey-relevant - * information from the siginfo - * - * Things to add: - * make sure KSM and KSM COW breaking works - * prefault pages in at malloc, or not - * protect MPX bounds tables with protection keys? 
- * make sure VMA splitting/merging is working correctly - * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys - * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel - * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks - * - * Compile like this: - * gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm - * gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm - */ -#define _GNU_SOURCE -#define __SANE_USERSPACE_TYPES__ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "pkey-helpers.h" - -int iteration_nr = 1; -int test_nr; - -u64 shadow_pkey_reg; -int dprint_in_signal; -char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; - -void cat_into_file(char *str, char *file) -{ - int fd = open(file, O_RDWR); - int ret; - - dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); - /* - * these need to be raw because they are called under - * pkey_assert() - */ - if (fd < 0) { - fprintf(stderr, "error opening '%s'\n", str); - perror("error: "); - exit(__LINE__); - } - - ret = write(fd, str, strlen(str)); - if (ret != strlen(str)) { - perror("write to file failed"); - fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); - exit(__LINE__); - } - close(fd); -} - -#if CONTROL_TRACING > 0 -static int warned_tracing; -int tracing_root_ok(void) -{ - if (geteuid() != 0) { - if (!warned_tracing) - fprintf(stderr, "WARNING: not run as root, " - "can not do tracing control\n"); - warned_tracing = 1; - return 0; - } - return 1; -} -#endif - -void tracing_on(void) -{ -#if CONTROL_TRACING > 0 -#define TRACEDIR "/sys/kernel/debug/tracing" - char pidstr[32]; - - if (!tracing_root_ok()) - return; - - sprintf(pidstr, "%d", getpid()); - cat_into_file("0", TRACEDIR "/tracing_on"); - cat_into_file("\n", TRACEDIR "/trace"); - if (1) { - cat_into_file("function_graph", TRACEDIR "/current_tracer"); - cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); - } else { - cat_into_file("nop", TRACEDIR "/current_tracer"); - } - cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); - cat_into_file("1", TRACEDIR "/tracing_on"); - dprintf1("enabled tracing\n"); -#endif -} - -void tracing_off(void) -{ -#if CONTROL_TRACING > 0 - if (!tracing_root_ok()) - return; - cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on"); -#endif -} - -void abort_hooks(void) -{ - fprintf(stderr, "running %s()...\n", __func__); - tracing_off(); -#ifdef SLEEP_ON_ABORT - sleep(SLEEP_ON_ABORT); -#endif -} - -/* - * This attempts to have roughly a page of instructions followed by a few - * instructions that do a write, and another page of instructions. That - * way, we are pretty sure that the write is in the second page of - * instructions and has at least a page of padding behind it. - * - * *That* lets us be sure to madvise() away the write instruction, which - * will then fault, which makes sure that the fault code handles - * execute-only memory properly. 
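- *
- * Illustrative flow (not from the original file), assuming 4K pages:
- *
- *	p = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE) + PAGE_SIZE;
- *	madvise(p, PAGE_SIZE, MADV_DONTNEED);	// drop the page with the write
- *	lots_o_noops_around_write(&scratch);	// refault it via instruction fetch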
- */ -#ifdef __powerpc64__ -/* This way, both 4K and 64K alignment are maintained */ -__attribute__((__aligned__(65536))) -#else -__attribute__((__aligned__(PAGE_SIZE))) -#endif -void lots_o_noops_around_write(int *write_to_me) -{ - dprintf3("running %s()\n", __func__); - __page_o_noops(); - /* Assume this happens in the second page of instructions: */ - *write_to_me = __LINE__; - /* pad out by another page: */ - __page_o_noops(); - dprintf3("%s() done\n", __func__); -} - -void dump_mem(void *dumpme, int len_bytes) -{ - char *c = (void *)dumpme; - int i; - - for (i = 0; i < len_bytes; i += sizeof(u64)) { - u64 *ptr = (u64 *)(c + i); - dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); - } -} - -static u32 hw_pkey_get(int pkey, unsigned long flags) -{ - u64 pkey_reg = __read_pkey_reg(); - - dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", - __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); - - return (u32) get_pkey_bits(pkey_reg, pkey); -} - -static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u64 old_pkey_reg = __read_pkey_reg(); - u64 new_pkey_reg; - - /* make sure that 'rights' only contains the bits we expect: */ - assert(!(rights & ~mask)); - - /* modify bits accordingly in old pkey_reg and assign it */ - new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); - - __write_pkey_reg(new_pkey_reg); - - dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" - " pkey_reg now: %016llx old_pkey_reg: %016llx\n", - __func__, pkey, rights, flags, 0, __read_pkey_reg(), - old_pkey_reg); - return 0; -} - -void pkey_disable_set(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights; - u64 orig_pkey_reg = read_pkey_reg(); - - dprintf1("START->%s(%d, 0x%x)\n", __func__, - pkey, flags); - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); - assert(!ret); - /* pkey_reg and flags have the same format */ - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); - dprintf1("%s(%d) shadow: 0x%016llx\n", - __func__, pkey, shadow_pkey_reg); - - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkey_reg: 0x%016llx\n", - __func__, pkey, read_pkey_reg()); - if (flags) - pkey_assert(read_pkey_reg() >= orig_pkey_reg); - dprintf1("END<---%s(%d, 0x%x)\n", __func__, - pkey, flags); -} - -void pkey_disable_clear(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u64 orig_pkey_reg = read_pkey_reg(); - - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights &= ~flags; - - ret = hw_pkey_set(pkey, pkey_rights, 0); - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, - pkey, read_pkey_reg()); - if (flags) - 
assert(read_pkey_reg() <= orig_pkey_reg); -} - -void pkey_write_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); -} -void pkey_write_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_WRITE); -} -void pkey_access_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); -} -void pkey_access_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); -} - -/* Failed address bound checks: */ -#ifndef SEGV_BNDERR -# define SEGV_BNDERR 3 -#endif - -#ifndef SEGV_PKUERR -# define SEGV_PKUERR 4 -#endif - -static char *si_code_str(int si_code) -{ - if (si_code == SEGV_MAPERR) - return "SEGV_MAPERR"; - if (si_code == SEGV_ACCERR) - return "SEGV_ACCERR"; - if (si_code == SEGV_BNDERR) - return "SEGV_BNDERR"; - if (si_code == SEGV_PKUERR) - return "SEGV_PKUERR"; - return "UNKNOWN"; -} - -int pkey_faults; -int last_si_pkey = -1; -void signal_handler(int signum, siginfo_t *si, void *vucontext) -{ - ucontext_t *uctxt = vucontext; - int trapno; - unsigned long ip; - char *fpregs; -#if defined(__i386__) || defined(__x86_64__) /* arch */ - u32 *pkey_reg_ptr; - int pkey_reg_offset; -#endif /* arch */ - u64 siginfo_pkey; - u32 *si_pkey_ptr; - - dprint_in_signal = 1; - dprintf1(">>>>===============SIGSEGV============================\n"); - dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", - __func__, __LINE__, - __read_pkey_reg(), shadow_pkey_reg); - - trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; - ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; - fpregs = (char *) uctxt->uc_mcontext.fpregs; - - dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", - __func__, trapno, ip, si_code_str(si->si_code), - si->si_code); - -#if defined(__i386__) || defined(__x86_64__) /* arch */ -#ifdef __i386__ - /* - * 32-bit has some extra padding so that userspace can tell whether - * the XSTATE header is present in addition to the "legacy" FPU - * state. We just assume that it is here. - */ - fpregs += 0x70; -#endif /* i386 */ - pkey_reg_offset = pkey_reg_xstate_offset(); - pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); - - /* - * If we got a PKEY fault, we *HAVE* to have at least one bit set in - * here. 
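- *
- * Illustrative check (not from the original file) of what "set" means
- * here, reading PKRU out of the signal frame's xsave area:
- *
- *	u32 pkru = *(u32 *)(fpregs + pkey_reg_xstate_offset());
- *	assert(pkru != 0);	// some access/write-disable bit must be on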
- */ - dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); - if (DEBUG_LEVEL > 4) - dump_mem(pkey_reg_ptr - 128, 256); - pkey_assert(*pkey_reg_ptr); -#endif /* arch */ - - dprintf1("siginfo: %p\n", si); - dprintf1(" fpregs: %p\n", fpregs); - - if ((si->si_code == SEGV_MAPERR) || - (si->si_code == SEGV_ACCERR) || - (si->si_code == SEGV_BNDERR)) { - printf("non-PK si_code, exiting...\n"); - exit(4); - } - - si_pkey_ptr = siginfo_get_pkey_ptr(si); - dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); - dump_mem((u8 *)si_pkey_ptr - 8, 24); - siginfo_pkey = *si_pkey_ptr; - pkey_assert(siginfo_pkey < NR_PKEYS); - last_si_pkey = siginfo_pkey; - - /* - * need __read_pkey_reg() version so we do not do shadow_pkey_reg - * checking - */ - dprintf1("signal pkey_reg from pkey_reg: %016llx\n", - __read_pkey_reg()); - dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); -#if defined(__i386__) || defined(__x86_64__) /* arch */ - dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); - *(u64 *)pkey_reg_ptr = 0x00000000; - dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); -#elif defined(__powerpc64__) /* arch */ - /* restore access and let the faulting instruction continue */ - pkey_access_allow(siginfo_pkey); -#endif /* arch */ - pkey_faults++; - dprintf1("<<<<==================================================\n"); - dprint_in_signal = 0; -} - -int wait_all_children(void) -{ - int status; - return waitpid(-1, &status, 0); -} - -void sig_chld(int x) -{ - dprint_in_signal = 1; - dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); - dprint_in_signal = 0; -} - -void setup_sigsegv_handler(void) -{ - int r, rs; - struct sigaction newact; - struct sigaction oldact; - - /* #PF is mapped to sigsegv */ - int signum = SIGSEGV; - - newact.sa_handler = 0; - newact.sa_sigaction = signal_handler; - - /*sigset_t - signals to block while in the handler */ - /* get the old signal mask. 
*/ - rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); - pkey_assert(rs == 0); - - /* call sa_sigaction, not sa_handler*/ - newact.sa_flags = SA_SIGINFO; - - newact.sa_restorer = 0; /* void(*)(), obsolete */ - r = sigaction(signum, &newact, &oldact); - r = sigaction(SIGALRM, &newact, &oldact); - pkey_assert(r == 0); -} - -void setup_handlers(void) -{ - signal(SIGCHLD, &sig_chld); - setup_sigsegv_handler(); -} - -pid_t fork_lazy_child(void) -{ - pid_t forkret; - - forkret = fork(); - pkey_assert(forkret >= 0); - dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); - - if (!forkret) { - /* in the child */ - while (1) { - dprintf1("child sleeping...\n"); - sleep(30); - } - } - return forkret; -} - -int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, - unsigned long pkey) -{ - int sret; - - dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, - ptr, size, orig_prot, pkey); - - errno = 0; - sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); - if (errno) { - dprintf2("SYS_mprotect_key sret: %d\n", sret); - dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); - dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); - if (DEBUG_LEVEL >= 2) - perror("SYS_mprotect_pkey"); - } - return sret; -} - -int sys_pkey_alloc(unsigned long flags, unsigned long init_val) -{ - int ret = syscall(SYS_pkey_alloc, flags, init_val); - dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", - __func__, flags, init_val, ret, errno); - return ret; -} - -int alloc_pkey(void) -{ - int ret; - unsigned long init_val = 0x0; - - dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", - __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); - ret = sys_pkey_alloc(0, init_val); - /* - * pkey_alloc() sets PKEY register, so we need to reflect it in - * shadow_pkey_reg: - */ - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - if (ret > 0) { - /* clear both the bits: */ - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, - ~PKEY_MASK); - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, - __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - /* - * move the new state in from init_val - * (remember, we cheated and init_val == pkey_reg format) - */ - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, - init_val); - } - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); - /* for shadow checking: */ - read_pkey_reg(); - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - return ret; -} - -int sys_pkey_free(unsigned long pkey) -{ - int ret = syscall(SYS_pkey_free, pkey); - dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); - return ret; -} - -/* - * I had a bug where pkey bits could be set by mprotect() but - * not cleared. This ensures we get lots of random bit sets - * and clears on the vma and pte pkey bits. 
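- *
- * Illustrative shape of the exercise (not from the original file):
- *
- *	int pkey = alloc_random_pkey();
- *	mprotect_pkey(ptr, size, PROT_READ | PROT_WRITE, pkey);
- *	sys_pkey_free(pkey);	// pkey bits must not linger on the VMA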
- */
-int alloc_random_pkey(void)
-{
- int max_nr_pkey_allocs;
- int ret;
- int i;
- int alloced_pkeys[NR_PKEYS];
- int nr_alloced = 0;
- int random_index;
- memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
-
- /* allocate every possible key and make a note of which ones we got */
- max_nr_pkey_allocs = NR_PKEYS;
- for (i = 0; i < max_nr_pkey_allocs; i++) {
- int new_pkey = alloc_pkey();
- if (new_pkey < 0)
- break;
- alloced_pkeys[nr_alloced++] = new_pkey;
- }
-
- pkey_assert(nr_alloced > 0);
- /* select a random one out of the allocated ones */
- random_index = rand() % nr_alloced;
- ret = alloced_pkeys[random_index];
- /* now zero it out so we don't free it next */
- alloced_pkeys[random_index] = 0;
-
- /* go through the allocated ones that we did not want and free them */
- for (i = 0; i < nr_alloced; i++) {
- int free_ret;
- if (!alloced_pkeys[i])
- continue;
- free_ret = sys_pkey_free(alloced_pkeys[i]);
- pkey_assert(!free_ret);
- }
- dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
- " shadow: 0x%016llx\n", __func__,
- __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
- return ret;
-}
-
-int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
- unsigned long pkey)
-{
- int nr_iterations = random() % 100;
- int ret;
-
- while (0) {
- int rpkey = alloc_random_pkey();
- ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
- dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
- ptr, size, orig_prot, pkey, ret);
- if (nr_iterations-- < 0)
- break;
-
- dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
- " shadow: 0x%016llx\n",
- __func__, __LINE__, ret, __read_pkey_reg(),
- shadow_pkey_reg);
- sys_pkey_free(rpkey);
- dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
- " shadow: 0x%016llx\n",
- __func__, __LINE__, ret, __read_pkey_reg(),
- shadow_pkey_reg);
- }
- pkey_assert(pkey < NR_PKEYS);
-
- ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
- dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
- ptr, size, orig_prot, pkey, ret);
- pkey_assert(!ret);
- dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
- " shadow: 0x%016llx\n", __func__,
- __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
- return ret;
-}
-
-struct pkey_malloc_record {
- void *ptr;
- long size;
- int prot;
-};
-struct pkey_malloc_record *pkey_malloc_records;
-struct pkey_malloc_record *pkey_last_malloc_record;
-long nr_pkey_malloc_records;
-void record_pkey_malloc(void *ptr, long size, int prot)
-{
- long i;
- struct pkey_malloc_record *rec = NULL;
-
- for (i = 0; i < nr_pkey_malloc_records; i++) {
- rec = &pkey_malloc_records[i];
- /* find a free record */
- if (!rec->ptr)
- break;
- rec = NULL;
- }
- if (!rec) {
- /* every record is full */
- size_t old_nr_records = nr_pkey_malloc_records;
- size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
- size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
- dprintf2("new_nr_records: %zd\n", new_nr_records);
- dprintf2("new_size: %zd\n", new_size);
- pkey_malloc_records = realloc(pkey_malloc_records, new_size);
- pkey_assert(pkey_malloc_records != NULL);
- rec = &pkey_malloc_records[nr_pkey_malloc_records];
- /*
- * realloc() does not initialize memory, so zero it from
- * the first new record all the way to the end. 
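- *
- * Equivalent zeroing sketch (illustrative, not from the original
- * file):
- *
- *	memset(&pkey_malloc_records[old_nr_records], 0,
- *	       (new_nr_records - old_nr_records) * sizeof(*rec));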
- */ - for (i = 0; i < new_nr_records - old_nr_records; i++) - memset(rec + i, 0, sizeof(*rec)); - } - dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", - (int)(rec - pkey_malloc_records), rec, ptr, size); - rec->ptr = ptr; - rec->size = size; - rec->prot = prot; - pkey_last_malloc_record = rec; - nr_pkey_malloc_records++; -} - -void free_pkey_malloc(void *ptr) -{ - long i; - int ret; - dprintf3("%s(%p)\n", __func__, ptr); - for (i = 0; i < nr_pkey_malloc_records; i++) { - struct pkey_malloc_record *rec = &pkey_malloc_records[i]; - dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", - ptr, i, rec, rec->ptr, rec->size); - if ((ptr < rec->ptr) || - (ptr >= rec->ptr + rec->size)) - continue; - - dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", - ptr, i, rec, rec->ptr, rec->size); - nr_pkey_malloc_records--; - ret = munmap(rec->ptr, rec->size); - dprintf3("munmap ret: %d\n", ret); - pkey_assert(!ret); - dprintf3("clearing rec->ptr, rec: %p\n", rec); - rec->ptr = NULL; - dprintf3("done clearing rec->ptr, rec: %p\n", rec); - return; - } - pkey_assert(false); -} - - -void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) -{ - void *ptr; - int ret; - - read_pkey_reg(); - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - pkey_assert(pkey < NR_PKEYS); - ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - pkey_assert(ptr != (void *)-1); - ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); - pkey_assert(!ret); - record_pkey_malloc(ptr, size, prot); - read_pkey_reg(); - - dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); - return ptr; -} - -void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) -{ - int ret; - void *ptr; - - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - /* - * Guarantee we can fit at least one huge page in the resulting - * allocation by allocating space for 2: - */ - size = ALIGN_UP(size, HPAGE_SIZE * 2); - ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - pkey_assert(ptr != (void *)-1); - record_pkey_malloc(ptr, size, prot); - mprotect_pkey(ptr, size, prot, pkey); - - dprintf1("unaligned ptr: %p\n", ptr); - ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); - dprintf1(" aligned ptr: %p\n", ptr); - ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); - dprintf1("MADV_HUGEPAGE ret: %d\n", ret); - ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); - dprintf1("MADV_WILLNEED ret: %d\n", ret); - memset(ptr, 0, HPAGE_SIZE); - - dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); - return ptr; -} - -int hugetlb_setup_ok; -#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" -#define GET_NR_HUGE_PAGES 10 -void setup_hugetlbfs(void) -{ - int err; - int fd; - char buf[256]; - long hpagesz_kb; - long hpagesz_mb; - - if (geteuid() != 0) { - fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); - return; - } - - cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); - - /* - * Now go make sure that we got the pages and that they - * are PMD-level pages. Someone might have made PUD-level - * pages the default. 
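- *
- * Illustrative expansion (not from the original file) of the sysfs
- * knob read below, for the 2MB HPAGE_SIZE used on x86:
- *
- *	sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, HPAGE_SIZE / 1024);
- *	// -> "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages"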
- */
- hpagesz_kb = HPAGE_SIZE / 1024;
- hpagesz_mb = hpagesz_kb / 1024;
- sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
- fd = open(buf, O_RDONLY);
- if (fd < 0) {
- fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
- hpagesz_mb, strerror(errno));
- return;
- }
-
- /* -1 to guarantee leaving the trailing \0 */
- err = read(fd, buf, sizeof(buf)-1);
- close(fd);
- if (err <= 0) {
- fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
- hpagesz_mb, strerror(errno));
- return;
- }
-
- if (atoi(buf) != GET_NR_HUGE_PAGES) {
- fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n",
- hpagesz_mb, buf, GET_NR_HUGE_PAGES);
- return;
- }
-
- hugetlb_setup_ok = 1;
-}
-
-void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
-{
- void *ptr;
- int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
-
- if (!hugetlb_setup_ok)
- return PTR_ERR_ENOTSUP;
-
- dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
- size = ALIGN_UP(size, HPAGE_SIZE * 2);
- pkey_assert(pkey < NR_PKEYS);
- ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
- pkey_assert(ptr != (void *)-1);
- mprotect_pkey(ptr, size, prot, pkey);
-
- record_pkey_malloc(ptr, size, prot);
-
- dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
- return ptr;
-}
-
-void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
-{
- void *ptr;
- int fd;
-
- dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
- size, prot, pkey);
- pkey_assert(pkey < NR_PKEYS);
- fd = open("/dax/foo", O_RDWR);
- pkey_assert(fd >= 0);
-
- ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
- pkey_assert(ptr != (void *)-1);
-
- mprotect_pkey(ptr, size, prot, pkey);
-
- record_pkey_malloc(ptr, size, prot);
-
- dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
- close(fd);
- return ptr;
-}
-
-void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
-
- malloc_pkey_with_mprotect,
- malloc_pkey_with_mprotect_subpage,
- malloc_pkey_anon_huge,
- malloc_pkey_hugetlb
-/* can not do direct with the pkey_mprotect() API:
- malloc_pkey_mmap_direct,
- malloc_pkey_mmap_dax,
-*/
-};
-
-void *malloc_pkey(long size, int prot, u16 pkey)
-{
- void *ret;
- static int malloc_type;
- int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
-
- pkey_assert(pkey < NR_PKEYS);
-
- while (1) {
- pkey_assert(malloc_type < nr_malloc_types);
-
- ret = pkey_malloc[malloc_type](size, prot, pkey);
- pkey_assert(ret != (void *)-1);
-
- malloc_type++;
- if (malloc_type >= nr_malloc_types)
- malloc_type = (random()%nr_malloc_types);
-
- /* try again if the malloc_type we tried is unsupported */
- if (ret == PTR_ERR_ENOTSUP)
- continue;
-
- break;
- }
-
- dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
- size, prot, pkey, ret);
- return ret;
-}
-
-int last_pkey_faults;
-#define UNKNOWN_PKEY -2
-void expected_pkey_fault(int pkey)
-{
- dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n",
- __func__, last_pkey_faults, pkey_faults);
- dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
- pkey_assert(last_pkey_faults + 1 == pkey_faults);
-
- /*
- * For exec-only memory, we do not know the pkey in
- * advance, so skip this check.
- */
- if (pkey != UNKNOWN_PKEY)
- pkey_assert(last_si_pkey == pkey);
-
-#if defined(__i386__) || defined(__x86_64__) /* arch */
- /*
- * The signal handler should have cleared out the PKEY register to
- * let the test program continue. We now have to restore it. 
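- *
- * Restore sketch (illustrative, not from the original file):
- *
- *	__write_pkey_reg(shadow_pkey_reg);	// bring HW back in sync
- *						// with the shadow copy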
- */ - if (__read_pkey_reg() != 0) -#else /* arch */ - if (__read_pkey_reg() != shadow_pkey_reg) -#endif /* arch */ - pkey_assert(0); - - __write_pkey_reg(shadow_pkey_reg); - dprintf1("%s() set pkey_reg=%016llx to restore state after signal " - "nuked it\n", __func__, shadow_pkey_reg); - last_pkey_faults = pkey_faults; - last_si_pkey = -1; -} - -#define do_not_expect_pkey_fault(msg) do { \ - if (last_pkey_faults != pkey_faults) \ - dprintf0("unexpected PKey fault: %s\n", msg); \ - pkey_assert(last_pkey_faults == pkey_faults); \ -} while (0) - -int test_fds[10] = { -1 }; -int nr_test_fds; -void __save_test_fd(int fd) -{ - pkey_assert(fd >= 0); - pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); - test_fds[nr_test_fds] = fd; - nr_test_fds++; -} - -int get_test_read_fd(void) -{ - int test_fd = open("/etc/passwd", O_RDONLY); - __save_test_fd(test_fd); - return test_fd; -} - -void close_test_fds(void) -{ - int i; - - for (i = 0; i < nr_test_fds; i++) { - if (test_fds[i] < 0) - continue; - close(test_fds[i]); - test_fds[i] = -1; - } - nr_test_fds = 0; -} - -#define barrier() __asm__ __volatile__("": : :"memory") -__attribute__((noinline)) int read_ptr(int *ptr) -{ - /* - * Keep GCC from optimizing this away somehow - */ - barrier(); - return *ptr; -} - -void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) -{ - int i, err; - int max_nr_pkey_allocs; - int alloced_pkeys[NR_PKEYS]; - int nr_alloced = 0; - long size; - - pkey_assert(pkey_last_malloc_record); - size = pkey_last_malloc_record->size; - /* - * This is a bit of a hack. But mprotect() requires - * huge-page-aligned sizes when operating on hugetlbfs. - * So, make sure that we use something that's a multiple - * of a huge page when we can. - */ - if (size >= HPAGE_SIZE) - size = HPAGE_SIZE; - - /* allocate every possible key and make sure key-0 never got allocated */ - max_nr_pkey_allocs = NR_PKEYS; - for (i = 0; i < max_nr_pkey_allocs; i++) { - int new_pkey = alloc_pkey(); - pkey_assert(new_pkey != 0); - - if (new_pkey < 0) - break; - alloced_pkeys[nr_alloced++] = new_pkey; - } - /* free all the allocated keys */ - for (i = 0; i < nr_alloced; i++) { - int free_ret; - - if (!alloced_pkeys[i]) - continue; - free_ret = sys_pkey_free(alloced_pkeys[i]); - pkey_assert(!free_ret); - } - - /* attach key-0 in various modes */ - err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); - pkey_assert(!err); -} - -void test_read_of_write_disabled_region(int *ptr, u16 pkey) -{ - int ptr_contents; - - dprintf1("disabling write access to PKEY[1], doing read\n"); - pkey_write_deny(pkey); - ptr_contents = read_ptr(ptr); - dprintf1("*ptr: %d\n", ptr_contents); - dprintf1("\n"); -} -void test_read_of_access_disabled_region(int *ptr, u16 pkey) -{ - int ptr_contents; - - dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); - read_pkey_reg(); - pkey_access_deny(pkey); - ptr_contents = read_ptr(ptr); - dprintf1("*ptr: %d\n", ptr_contents); - expected_pkey_fault(pkey); -} - -void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, - u16 pkey) -{ - int ptr_contents; - - dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", - pkey, ptr); - ptr_contents = read_ptr(ptr); - dprintf1("reading ptr before 
disabling the read : %d\n", - ptr_contents); - read_pkey_reg(); - pkey_access_deny(pkey); - ptr_contents = read_ptr(ptr); - dprintf1("*ptr: %d\n", ptr_contents); - expected_pkey_fault(pkey); -} - -void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, - u16 pkey) -{ - *ptr = __LINE__; - dprintf1("disabling write access; after accessing the page, " - "to PKEY[%02d], doing write\n", pkey); - pkey_write_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} - -void test_write_of_write_disabled_region(int *ptr, u16 pkey) -{ - dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); - pkey_write_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} -void test_write_of_access_disabled_region(int *ptr, u16 pkey) -{ - dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); - pkey_access_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} - -void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, - u16 pkey) -{ - *ptr = __LINE__; - dprintf1("disabling access; after accessing the page, " - " to PKEY[%02d], doing write\n", pkey); - pkey_access_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} - -void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) -{ - int ret; - int test_fd = get_test_read_fd(); - - dprintf1("disabling access to PKEY[%02d], " - "having kernel read() to buffer\n", pkey); - pkey_access_deny(pkey); - ret = read(test_fd, ptr, 1); - dprintf1("read ret: %d\n", ret); - pkey_assert(ret); -} -void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) -{ - int ret; - int test_fd = get_test_read_fd(); - - pkey_write_deny(pkey); - ret = read(test_fd, ptr, 100); - dprintf1("read ret: %d\n", ret); - if (ret < 0 && (DEBUG_LEVEL > 0)) - perror("verbose read result (OK for this to be bad)"); - pkey_assert(ret); -} - -void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) -{ - int pipe_ret, vmsplice_ret; - struct iovec iov; - int pipe_fds[2]; - - pipe_ret = pipe(pipe_fds); - - pkey_assert(pipe_ret == 0); - dprintf1("disabling access to PKEY[%02d], " - "having kernel vmsplice from buffer\n", pkey); - pkey_access_deny(pkey); - iov.iov_base = ptr; - iov.iov_len = PAGE_SIZE; - vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); - dprintf1("vmsplice() ret: %d\n", vmsplice_ret); - pkey_assert(vmsplice_ret == -1); - - close(pipe_fds[0]); - close(pipe_fds[1]); -} - -void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) -{ - int ignored = 0xdada; - int futex_ret; - int some_int = __LINE__; - - dprintf1("disabling write to PKEY[%02d], " - "doing futex gunk in buffer\n", pkey); - *ptr = some_int; - pkey_write_deny(pkey); - futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, - &ignored, ignored); - if (DEBUG_LEVEL > 0) - perror("futex"); - dprintf1("futex() ret: %d\n", futex_ret); -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) -{ - int err; - int i; - - /* Note: 0 is the default pkey, so don't mess with it */ - for (i = 1; i < NR_PKEYS; i++) { - if (pkey == i) - continue; - - dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); - err = sys_pkey_free(i); - pkey_assert(err); - - err = sys_pkey_free(i); - pkey_assert(err); - - err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); - pkey_assert(err); - } -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) -{ - int 
err; - int bad_pkey = NR_PKEYS+99; - - /* pass a known-invalid pkey in: */ - err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); - pkey_assert(err); -} - -void become_child(void) -{ - pid_t forkret; - - forkret = fork(); - pkey_assert(forkret >= 0); - dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); - - if (!forkret) { - /* in the child */ - return; - } - exit(0); -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_alloc_exhaust(int *ptr, u16 pkey) -{ - int err; - int allocated_pkeys[NR_PKEYS] = {0}; - int nr_allocated_pkeys = 0; - int i; - - for (i = 0; i < NR_PKEYS*3; i++) { - int new_pkey; - dprintf1("%s() alloc loop: %d\n", __func__, i); - new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, err, __read_pkey_reg(), - shadow_pkey_reg); - read_pkey_reg(); /* for shadow checking */ - dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); - if ((new_pkey == -1) && (errno == ENOSPC)) { - dprintf2("%s() failed to allocate pkey after %d tries\n", - __func__, nr_allocated_pkeys); - } else { - /* - * Ensure the number of successes never - * exceeds the number of keys supported - * in the hardware. - */ - pkey_assert(nr_allocated_pkeys < NR_PKEYS); - allocated_pkeys[nr_allocated_pkeys++] = new_pkey; - } - - /* - * Make sure that allocation state is properly - * preserved across fork(). - */ - if (i == NR_PKEYS*2) - become_child(); - } - - dprintf3("%s()::%d\n", __func__, __LINE__); - - /* - * On x86: - * There are 16 pkeys supported in hardware. Three are - * allocated by the time we get here: - * 1. The default key (0) - * 2. One possibly consumed by an execute-only mapping. - * 3. One allocated by the test code and passed in via - * 'pkey' to this function. - * Ensure that we can allocate at least another 13 (16-3). - * - * On powerpc: - * There are either 5, 28, 29 or 32 pkeys supported in - * hardware depending on the page size (4K or 64K) and - * platform (powernv or powervm). Four are allocated by - * the time we get here. These include pkey-0, pkey-1, - * exec-only pkey and the one allocated by the test code. - * Ensure that we can allocate the remaining. - */ - pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); - - for (i = 0; i < nr_allocated_pkeys; i++) { - err = sys_pkey_free(allocated_pkeys[i]); - pkey_assert(!err); - read_pkey_reg(); /* for shadow checking */ - } -} - -void arch_force_pkey_reg_init(void) -{ -#if defined(__i386__) || defined(__x86_64__) /* arch */ - u64 *buf; - - /* - * All keys should be allocated and set to allow reads and - * writes, so the register should be all 0. If not, just - * skip the test. - */ - if (read_pkey_reg()) - return; - - /* - * Just allocate an absurd about of memory rather than - * doing the XSAVE size enumeration dance. - */ - buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - - /* These __builtins require compiling with -mxsave */ - - /* XSAVE to build a valid buffer: */ - __builtin_ia32_xsave(buf, XSTATE_PKEY); - /* Clear XSTATE_BV[PKRU]: */ - buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY; - /* XRSTOR will likely get PKRU back to the init state: */ - __builtin_ia32_xrstor(buf, XSTATE_PKEY); - - munmap(buf, 1*MB); -#endif -} - - -/* - * This is mostly useless on ppc for now. But it will not - * hurt anything and should give some better coverage as - * a long-running test that continually checks the pkey - * register. 
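- *
- * The checking loop below is essentially (illustrative, not from the
- * original file):
- *
- *	for (i = 0; i < 1000000; i++)
- *		read_pkey_reg();	// asserts HW value == shadow copy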
- */ -void test_pkey_init_state(int *ptr, u16 pkey) -{ - int err; - int allocated_pkeys[NR_PKEYS] = {0}; - int nr_allocated_pkeys = 0; - int i; - - for (i = 0; i < NR_PKEYS; i++) { - int new_pkey = alloc_pkey(); - - if (new_pkey < 0) - continue; - allocated_pkeys[nr_allocated_pkeys++] = new_pkey; - } - - dprintf3("%s()::%d\n", __func__, __LINE__); - - arch_force_pkey_reg_init(); - - /* - * Loop for a bit, hoping to exercise the kernel - * context switch code. - */ - for (i = 0; i < 1000000; i++) - read_pkey_reg(); - - for (i = 0; i < nr_allocated_pkeys; i++) { - err = sys_pkey_free(allocated_pkeys[i]); - pkey_assert(!err); - read_pkey_reg(); /* for shadow checking */ - } -} - -/* - * pkey 0 is special. It is allocated by default, so you do not - * have to call pkey_alloc() to use it first. Make sure that it - * is usable. - */ -void test_mprotect_with_pkey_0(int *ptr, u16 pkey) -{ - long size; - int prot; - - assert(pkey_last_malloc_record); - size = pkey_last_malloc_record->size; - /* - * This is a bit of a hack. But mprotect() requires - * huge-page-aligned sizes when operating on hugetlbfs. - * So, make sure that we use something that's a multiple - * of a huge page when we can. - */ - if (size >= HPAGE_SIZE) - size = HPAGE_SIZE; - prot = pkey_last_malloc_record->prot; - - /* Use pkey 0 */ - mprotect_pkey(ptr, size, prot, 0); - - /* Make sure that we can set it back to the original pkey. */ - mprotect_pkey(ptr, size, prot, pkey); -} - -void test_ptrace_of_child(int *ptr, u16 pkey) -{ - __attribute__((__unused__)) int peek_result; - pid_t child_pid; - void *ignored = 0; - long ret; - int status; - /* - * This is the "control" for our little experiment. Make sure - * we can always access it when ptracing. - */ - int *plain_ptr_unaligned = malloc(HPAGE_SIZE); - int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); - - /* - * Fork a child which is an exact copy of this process, of course. - * That means we can do all of our tests via ptrace() and then plain - * memory access and ensure they work differently. 
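The attach/stop/peek sequence that test_ptrace_of_child() performs is easier to see in isolation. A minimal self-contained sketch of that sequence, with no pkeys involved (all names below are local to the example, not part of the selftest):

#define _GNU_SOURCE
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	long word = 0x1234abcd;	/* the tracer will peek at this */
	pid_t child = fork();

	if (child == 0) {
		for (;;)	/* child: idle until killed */
			pause();
	}

	if (ptrace(PTRACE_ATTACH, child, NULL, NULL)) {
		perror("PTRACE_ATTACH");
		return 1;
	}
	waitpid(child, NULL, WUNTRACED);	/* wait for the stop */

	/* fork() gave the child a copy of 'word' at the same address. */
	errno = 0;
	printf("peeked 0x%lx\n", ptrace(PTRACE_PEEKDATA, child, &word, NULL));

	ptrace(PTRACE_DETACH, child, NULL, NULL);
	kill(child, SIGKILL);
	waitpid(child, NULL, 0);
	return 0;
}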
- */ - child_pid = fork_lazy_child(); - dprintf1("[%d] child pid: %d\n", getpid(), child_pid); - - ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); - if (ret) - perror("attach"); - dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); - pkey_assert(ret != -1); - ret = waitpid(child_pid, &status, WUNTRACED); - if ((ret != child_pid) || !(WIFSTOPPED(status))) { - fprintf(stderr, "weird waitpid result %ld stat %x\n", - ret, status); - pkey_assert(0); - } - dprintf2("waitpid ret: %ld\n", ret); - dprintf2("waitpid status: %d\n", status); - - pkey_access_deny(pkey); - pkey_write_deny(pkey); - - /* Write access, untested for now: - ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); - pkey_assert(ret != -1); - dprintf1("poke at %p: %ld\n", peek_at, ret); - */ - - /* - * Try to access the pkey-protected "ptr" via ptrace: - */ - ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); - /* expect it to work, without an error: */ - pkey_assert(ret != -1); - /* Now access from the current task, and expect an exception: */ - peek_result = read_ptr(ptr); - expected_pkey_fault(pkey); - - /* - * Try to access the NON-pkey-protected "plain_ptr" via ptrace: - */ - ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); - /* expect it to work, without an error: */ - pkey_assert(ret != -1); - /* Now access from the current task, and expect NO exception: */ - peek_result = read_ptr(plain_ptr); - do_not_expect_pkey_fault("read plain pointer after ptrace"); - - ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); - pkey_assert(ret != -1); - - ret = kill(child_pid, SIGKILL); - pkey_assert(ret != -1); - - wait(&status); - - free(plain_ptr_unaligned); -} - -void *get_pointer_to_instructions(void) -{ - void *p1; - - p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); - dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); - /* lots_o_noops_around_write should be page-aligned already */ - assert(p1 == &lots_o_noops_around_write); - - /* Point 'p1' at the *second* page of the function: */ - p1 += PAGE_SIZE; - - /* - * Try to ensure we fault this in on next touch to ensure - * we get an instruction fault as opposed to a data one - */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - - return p1; -} - -void test_executing_on_unreadable_memory(int *ptr, u16 pkey) -{ - void *p1; - int scratch; - int ptr_contents; - int ret; - - p1 = get_pointer_to_instructions(); - lots_o_noops_around_write(&scratch); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - - ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); - pkey_assert(!ret); - pkey_access_deny(pkey); - - dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); - - /* - * Make sure this is an *instruction* fault - */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - lots_o_noops_around_write(&scratch); - do_not_expect_pkey_fault("executing on PROT_EXEC memory"); - expect_fault_on_read_execonly_key(p1, pkey); -} - -void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) -{ - void *p1; - int scratch; - int ptr_contents; - int ret; - - dprintf1("%s() start\n", __func__); - - p1 = get_pointer_to_instructions(); - lots_o_noops_around_write(&scratch); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - - /* Use a *normal* mprotect(), not mprotect_pkey(): */ - ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); - pkey_assert(!ret); - - /* - * Reset the shadow, assuming that the above mprotect() - * correctly changed PKRU, but to an unknown value since - 
* the actual allocated pkey is unknown. - */ - shadow_pkey_reg = __read_pkey_reg(); - - dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); - - /* Make sure this is an *instruction* fault */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - lots_o_noops_around_write(&scratch); - do_not_expect_pkey_fault("executing on PROT_EXEC memory"); - expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); - - /* - * Put the memory back to non-PROT_EXEC. Should clear the - * exec-only pkey off the VMA and allow it to be readable - * again. Go to PROT_NONE first to check for a kernel bug - * that did not clear the pkey when doing PROT_NONE. - */ - ret = mprotect(p1, PAGE_SIZE, PROT_NONE); - pkey_assert(!ret); - - ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); - pkey_assert(!ret); - ptr_contents = read_ptr(p1); - do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); -} - -#if defined(__i386__) || defined(__x86_64__) -void test_ptrace_modifies_pkru(int *ptr, u16 pkey) -{ - u32 new_pkru; - pid_t child; - int status, ret; - int pkey_offset = pkey_reg_xstate_offset(); - size_t xsave_size = cpu_max_xsave_size(); - void *xsave; - u32 *pkey_register; - u64 *xstate_bv; - struct iovec iov; - - new_pkru = ~read_pkey_reg(); - /* Don't make PROT_EXEC mappings inaccessible */ - new_pkru &= ~3; - - child = fork(); - pkey_assert(child >= 0); - dprintf3("[%d] fork() ret: %d\n", getpid(), child); - if (!child) { - ptrace(PTRACE_TRACEME, 0, 0, 0); - /* Stop and allow the tracer to modify PKRU directly */ - raise(SIGSTOP); - - /* - * need __read_pkey_reg() version so we do not do shadow_pkey_reg - * checking - */ - if (__read_pkey_reg() != new_pkru) - exit(1); - - /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ - raise(SIGSTOP); - - if (__read_pkey_reg() != 0) - exit(1); - - /* Stop and allow the tracer to examine PKRU */ - raise(SIGSTOP); - - exit(0); - } - - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); - - xsave = (void *)malloc(xsave_size); - pkey_assert(xsave > 0); - - /* Modify the PKRU register directly */ - iov.iov_base = xsave; - iov.iov_len = xsave_size; - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - - pkey_register = (u32 *)(xsave + pkey_offset); - pkey_assert(*pkey_register == read_pkey_reg()); - - *pkey_register = new_pkru; - - ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - - /* Test that the modification is visible in ptrace before any execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == new_pkru); - - /* Execute the tracee */ - ret = ptrace(PTRACE_CONT, child, 0, 0); - pkey_assert(ret == 0); - - /* Test that the tracee saw the PKRU value change */ - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); - - /* Test that the modification is visible in ptrace after execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == new_pkru); - - /* Clear the PKRU bit from XSTATE_BV */ - xstate_bv = (u64 *)(xsave + 512); - *xstate_bv &= ~(1 << 9); - - ret = ptrace(PTRACE_SETREGSET, child, (void 
*)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - - /* Test that the modification is visible in ptrace before any execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == 0); - - ret = ptrace(PTRACE_CONT, child, 0, 0); - pkey_assert(ret == 0); - - /* Test that the tracee saw the PKRU value go to 0 */ - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); - - /* Test that the modification is visible in ptrace after execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == 0); - - ret = ptrace(PTRACE_CONT, child, 0, 0); - pkey_assert(ret == 0); - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFEXITED(status)); - pkey_assert(WEXITSTATUS(status) == 0); - free(xsave); -} -#endif - -void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) -{ - int size = PAGE_SIZE; - int sret; - - if (cpu_has_pkeys()) { - dprintf1("SKIP: %s: no CPU support\n", __func__); - return; - } - - sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); - pkey_assert(sret < 0); -} - -void (*pkey_tests[])(int *ptr, u16 pkey) = { - test_read_of_write_disabled_region, - test_read_of_access_disabled_region, - test_read_of_access_disabled_region_with_page_already_mapped, - test_write_of_write_disabled_region, - test_write_of_write_disabled_region_with_page_already_mapped, - test_write_of_access_disabled_region, - test_write_of_access_disabled_region_with_page_already_mapped, - test_kernel_write_of_access_disabled_region, - test_kernel_write_of_write_disabled_region, - test_kernel_gup_of_access_disabled_region, - test_kernel_gup_write_to_write_disabled_region, - test_executing_on_unreadable_memory, - test_implicit_mprotect_exec_only_memory, - test_mprotect_with_pkey_0, - test_ptrace_of_child, - test_pkey_init_state, - test_pkey_syscalls_on_non_allocated_pkey, - test_pkey_syscalls_bad_args, - test_pkey_alloc_exhaust, - test_pkey_alloc_free_attach_pkey0, -#if defined(__i386__) || defined(__x86_64__) - test_ptrace_modifies_pkru, -#endif -}; - -void run_tests_once(void) -{ - int *ptr; - int prot = PROT_READ|PROT_WRITE; - - for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { - int pkey; - int orig_pkey_faults = pkey_faults; - - dprintf1("======================\n"); - dprintf1("test %d preparing...\n", test_nr); - - tracing_on(); - pkey = alloc_random_pkey(); - dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); - ptr = malloc_pkey(PAGE_SIZE, prot, pkey); - dprintf1("test %d starting...\n", test_nr); - pkey_tests[test_nr](ptr, pkey); - dprintf1("freeing test memory: %p\n", ptr); - free_pkey_malloc(ptr); - sys_pkey_free(pkey); - - dprintf1("pkey_faults: %d\n", pkey_faults); - dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); - - tracing_off(); - close_test_fds(); - - printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); - dprintf1("======================\n\n"); - } - iteration_nr++; -} - -void pkey_setup_shadow(void) -{ - shadow_pkey_reg = __read_pkey_reg(); -} - -int main(void) -{ - int nr_iterations = 22; - int pkeys_supported = is_pkeys_supported(); - - srand((unsigned int)time(NULL)); - - setup_handlers(); - - printf("has pkeys: 
%d\n", pkeys_supported); - - if (!pkeys_supported) { - int size = PAGE_SIZE; - int *ptr; - - printf("running PKEY tests for unsupported CPU/OS\n"); - - ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - assert(ptr != (void *)-1); - test_mprotect_pkey_on_unsupported_cpu(ptr, 1); - exit(0); - } - - pkey_setup_shadow(); - printf("startup pkey_reg: %016llx\n", read_pkey_reg()); - setup_hugetlbfs(); - - while (nr_iterations-- > 0) - run_tests_once(); - - printf("done (all tests OK)\n"); - return 0; -} diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh deleted file mode 100755 index 8984e0bb58c7..000000000000 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ /dev/null @@ -1,274 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# Please run as root - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -exitcode=0 - -usage() { - cat <"] - -t: specify specific categories to tests to run - -h: display this message - -The default behavior is to run all tests. - -Alternatively, specific groups tests can be run by passing a string -to the -t argument containing one or more of the following categories -separated by spaces: -- mmap - tests for mmap(2) -- gup_test - tests for gup using gup_test interface -- userfaultfd - tests for userfaultfd(2) -- compaction - a test for the patch "Allow compaction of unevictable pages" -- mlock - tests for mlock(2) -- mremap - tests for mremap(2) -- hugevm - tests for very large virtual address space -- vmalloc - vmalloc smoke tests -- hmm - hmm smoke tests -- madv_populate - test memadvise(2) MADV_POPULATE_{READ,WRITE} options -- memfd_secret - test memfd_secret(2) -- process_mrelease - test process_mrelease(2) -- ksm - ksm tests that do not require >=2 NUMA nodes -- ksm_numa - ksm tests that require >=2 NUMA nodes -- pkey - memory protection key tests -- soft_dirty - test soft dirty page bit semantics -- cow - test copy-on-write semantics -example: ./run_vmtests.sh -t "hmm mmap ksm" -EOF - exit 0 -} - - -while getopts "ht:" OPT; do - case ${OPT} in - "h") usage ;; - "t") VM_SELFTEST_ITEMS=${OPTARG} ;; - esac -done -shift $((OPTIND -1)) - -# default behavior: run all tests -VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default} - -test_selected() { - if [ "$VM_SELFTEST_ITEMS" == "default" ]; then - # If no VM_SELFTEST_ITEMS are specified, run all tests - return 0 - fi - # If test selected argument is one of the test items - if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then - return 0 - else - return 1 - fi -} - -# get huge pagesize and freepages from /proc/meminfo -while read -r name size unit; do - if [ "$name" = "HugePages_Free:" ]; then - freepgs="$size" - fi - if [ "$name" = "Hugepagesize:" ]; then - hpgsize_KB="$size" - fi -done < /proc/meminfo - -# Simple hugetlbfs tests have a hardcoded minimum requirement of -# huge pages totaling 256MB (262144KB) in size. The userfaultfd -# hugetlb test requires a minimum of 2 * nr_cpus huge pages. Take -# both of these requirements into account and attempt to increase -# number of huge pages available. 
-nr_cpus=$(nproc) -hpgsize_MB=$((hpgsize_KB / 1024)) -half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) -needmem_KB=$((half_ufd_size_MB * 2 * 1024)) - -# set proper nr_hugepages -if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then - nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) - needpgs=$((needmem_KB / hpgsize_KB)) - tries=2 - while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do - lackpgs=$((needpgs - freepgs)) - echo 3 > /proc/sys/vm/drop_caches - if ! echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then - echo "Please run this test as root" - exit $ksft_skip - fi - while read -r name size unit; do - if [ "$name" = "HugePages_Free:" ]; then - freepgs=$size - fi - done < /proc/meminfo - tries=$((tries - 1)) - done - if [ "$freepgs" -lt "$needpgs" ]; then - printf "Not enough huge pages available (%d < %d)\n" \ - "$freepgs" "$needpgs" - exit 1 - fi -else - echo "no hugetlbfs support in kernel?" - exit 1 -fi - -# filter 64bit architectures -ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64" -if [ -z "$ARCH" ]; then - ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') -fi -VADDR64=0 -echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1 - -# Usage: run_test [test binary] [arbitrary test arguments...] -run_test() { - if test_selected ${CATEGORY}; then - local title="running $*" - local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) - printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" - - "$@" - local ret=$? - if [ $ret -eq 0 ]; then - echo "[PASS]" - elif [ $ret -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip - else - echo "[FAIL]" - exitcode=1 - fi - fi # test_selected -} - -CATEGORY="hugetlb" run_test ./hugepage-mmap - -shmmax=$(cat /proc/sys/kernel/shmmax) -shmall=$(cat /proc/sys/kernel/shmall) -echo 268435456 > /proc/sys/kernel/shmmax -echo 4194304 > /proc/sys/kernel/shmall -CATEGORY="hugetlb" run_test ./hugepage-shm -echo "$shmmax" > /proc/sys/kernel/shmmax -echo "$shmall" > /proc/sys/kernel/shmall - -CATEGORY="hugetlb" run_test ./map_hugetlb -CATEGORY="hugetlb" run_test ./hugepage-mremap -CATEGORY="hugetlb" run_test ./hugepage-vmemmap -CATEGORY="hugetlb" run_test ./hugetlb-madvise - -if test_selected "hugetlb"; then - echo "NOTE: These hugetlb tests provide minimal coverage. Use" - echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" - echo " hugetlb regression testing." -fi - -CATEGORY="mmap" run_test ./map_fixed_noreplace - -# get_user_pages_fast() benchmark -CATEGORY="gup_test" run_test ./gup_test -u -# pin_user_pages_fast() benchmark -CATEGORY="gup_test" run_test ./gup_test -a -# Dump pages 0, 19, and 4096, using pin_user_pages: -CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 - -uffd_mods=("" ":dev") -for mod in "${uffd_mods[@]}"; do - CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16 - # Hugetlb tests require source and destination huge pages. Pass in half - # the size ($half_ufd_size_MB), which is used for *each*. 
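The half_ufd_size_MB computation earlier in the script is a ceiling-to-multiple: nr_cpus * hpgsize_MB rounded up to the next multiple of 128 MB. A small C restatement of the same arithmetic, with hypothetical values (8 CPUs, 2 MB pages need 16 MB, which rounds up to 128 MB):

#include <stdio.h>

/* Ceiling of x to a multiple of m, e.g. round_up(16, 128) == 128. */
static long round_up(long x, long m)
{
	return (x + m - 1) / m * m;
}

int main(void)
{
	long nr_cpus = 8, hpgsize_mb = 2;	/* hypothetical values */

	/* Mirrors: half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) */
	printf("half_ufd_size_MB = %ld\n",
	       round_up(nr_cpus * hpgsize_mb, 128));
	return 0;
}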
- CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 - CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 - CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16 -done - -#cleanup -echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages - -CATEGORY="compaction" run_test ./compaction_test - -CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit - -CATEGORY="mmap" run_test ./map_populate - -CATEGORY="mlock" run_test ./mlock-random-test - -CATEGORY="mlock" run_test ./mlock2-tests - -CATEGORY="process_mrelease" run_test ./mrelease_test - -CATEGORY="mremap" run_test ./mremap_test - -CATEGORY="hugetlb" run_test ./thuge-gen - -if [ $VADDR64 -ne 0 ]; then - CATEGORY="hugevm" run_test ./virtual_address_range - - # virtual address 128TB switch test - CATEGORY="hugevm" run_test ./va_128TBswitch.sh -fi # VADDR64 - -# vmalloc stability smoke test -CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke - -CATEGORY="mremap" run_test ./mremap_dontunmap - -CATEGORY="hmm" run_test ./test_hmm.sh smoke - -# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests -CATEGORY="madv_populate" run_test ./madv_populate - -CATEGORY="memfd_secret" run_test ./memfd_secret - -# KSM MADV_MERGEABLE test with 10 identical pages -CATEGORY="ksm" run_test ./ksm_tests -M -p 10 -# KSM unmerge test -CATEGORY="ksm" run_test ./ksm_tests -U -# KSM test with 10 zero pages and use_zero_pages = 0 -CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0 -# KSM test with 10 zero pages and use_zero_pages = 1 -CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1 -# KSM test with 2 NUMA nodes and merge_across_nodes = 1 -CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1 -# KSM test with 2 NUMA nodes and merge_across_nodes = 0 -CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0 - -CATEGORY="ksm" run_test ./ksm_functional_tests - -run_test ./ksm_functional_tests - -# protection_keys tests -if [ -x ./protection_keys_32 ] -then - CATEGORY="pkey" run_test ./protection_keys_32 -fi - -if [ -x ./protection_keys_64 ] -then - CATEGORY="pkey" run_test ./protection_keys_64 -fi - -CATEGORY="soft_dirty" run_test ./soft-dirty - -# COW tests -CATEGORY="cow" run_test ./cow - -exit $exitcode diff --git a/tools/testing/selftests/vm/settings b/tools/testing/selftests/vm/settings deleted file mode 100644 index 9abfc60e9e6f..000000000000 --- a/tools/testing/selftests/vm/settings +++ /dev/null @@ -1 +0,0 @@ -timeout=45 diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c deleted file mode 100644 index 21d8830c5f24..000000000000 --- a/tools/testing/selftests/vm/soft-dirty.c +++ /dev/null @@ -1,210 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include "../kselftest.h" -#include "vm_util.h" - -#define PAGEMAP_FILE_PATH "/proc/self/pagemap" -#define TEST_ITERATIONS 10000 - -static void test_simple(int pagemap_fd, int pagesize) -{ - int i; - char *map; - - map = aligned_alloc(pagesize, pagesize); - if (!map) - ksft_exit_fail_msg("mmap failed\n"); - - clear_softdirty(); - - for (i = 0 ; i < TEST_ITERATIONS; i++) { - if (pagemap_is_softdirty(pagemap_fd, map) == 1) { - ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); - break; - } - - clear_softdirty(); - // Write something to the page to get the dirty bit enabled on the page - map[0]++; - - if (pagemap_is_softdirty(pagemap_fd, map) == 0) { - ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); - break; - } - - clear_softdirty(); - 
} - free(map); - - ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__); -} - -static void test_vma_reuse(int pagemap_fd, int pagesize) -{ - char *map, *map2; - - map = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); - if (map == MAP_FAILED) - ksft_exit_fail_msg("mmap failed"); - - // The kernel always marks new regions as soft dirty - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, - "Test %s dirty bit of allocated page\n", __func__); - - clear_softdirty(); - munmap(map, pagesize); - - map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); - if (map2 == MAP_FAILED) - ksft_exit_fail_msg("mmap failed"); - - // Dirty bit is set for new regions even if they are reused - if (map == map2) - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, - "Test %s dirty bit of reused address page\n", __func__); - else - ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__); - - munmap(map2, pagesize); -} - -static void test_hugepage(int pagemap_fd, int pagesize) -{ - char *map; - int i, ret; - size_t hpage_len = read_pmd_pagesize(); - - map = memalign(hpage_len, hpage_len); - if (!map) - ksft_exit_fail_msg("memalign failed\n"); - - ret = madvise(map, hpage_len, MADV_HUGEPAGE); - if (ret) - ksft_exit_fail_msg("madvise failed %d\n", ret); - - for (i = 0; i < hpage_len; i++) - map[i] = (char)i; - - if (check_huge_anon(map, 1, hpage_len)) { - ksft_test_result_pass("Test %s huge page allocation\n", __func__); - - clear_softdirty(); - for (i = 0 ; i < TEST_ITERATIONS ; i++) { - if (pagemap_is_softdirty(pagemap_fd, map) == 1) { - ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); - break; - } - - clear_softdirty(); - // Write something to the page to get the dirty bit enabled on the page - map[0]++; - - if (pagemap_is_softdirty(pagemap_fd, map) == 0) { - ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); - break; - } - clear_softdirty(); - } - - ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__); - } else { - // hugepage allocation failed. 
skip these tests - ksft_test_result_skip("Test %s huge page allocation\n", __func__); - ksft_test_result_skip("Test %s huge page dirty bit\n", __func__); - } - free(map); -} - -static void test_mprotect(int pagemap_fd, int pagesize, bool anon) -{ - const char *type[] = {"file", "anon"}; - const char *fname = "./soft-dirty-test-file"; - int test_fd; - char *map; - - if (anon) { - map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - if (!map) - ksft_exit_fail_msg("anon mmap failed\n"); - } else { - test_fd = open(fname, O_RDWR | O_CREAT); - if (test_fd < 0) { - ksft_test_result_skip("Test %s open() file failed\n", __func__); - return; - } - unlink(fname); - ftruncate(test_fd, pagesize); - map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, - MAP_SHARED, test_fd, 0); - if (!map) - ksft_exit_fail_msg("file mmap failed\n"); - } - - *map = 1; - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, - "Test %s-%s dirty bit of new written page\n", - __func__, type[anon]); - clear_softdirty(); - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, - "Test %s-%s soft-dirty clear after clear_refs\n", - __func__, type[anon]); - mprotect(map, pagesize, PROT_READ); - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, - "Test %s-%s soft-dirty clear after marking RO\n", - __func__, type[anon]); - mprotect(map, pagesize, PROT_READ|PROT_WRITE); - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, - "Test %s-%s soft-dirty clear after marking RW\n", - __func__, type[anon]); - *map = 2; - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, - "Test %s-%s soft-dirty after rewritten\n", - __func__, type[anon]); - - munmap(map, pagesize); - - if (!anon) - close(test_fd); -} - -static void test_mprotect_anon(int pagemap_fd, int pagesize) -{ - test_mprotect(pagemap_fd, pagesize, true); -} - -static void test_mprotect_file(int pagemap_fd, int pagesize) -{ - test_mprotect(pagemap_fd, pagesize, false); -} - -int main(int argc, char **argv) -{ - int pagemap_fd; - int pagesize; - - ksft_print_header(); - ksft_set_plan(15); - - pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); - if (pagemap_fd < 0) - ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH); - - pagesize = getpagesize(); - - test_simple(pagemap_fd, pagesize); - test_vma_reuse(pagemap_fd, pagesize); - test_hugepage(pagemap_fd, pagesize); - test_mprotect_anon(pagemap_fd, pagesize); - test_mprotect_file(pagemap_fd, pagesize); - - close(pagemap_fd); - - return ksft_exit_pass(); -} diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c deleted file mode 100644 index 76e1c36dd9e5..000000000000 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ /dev/null @@ -1,309 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual - * address range in a process via /split_huge_pages interface. 
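The deleted test drives THP splitting through the debugfs split_huge_pages file using a "pid,start,end" triple (PID_FMT below). Reduced to its essence, the write looks like this sketch; it requires root and a mounted debugfs, and the address range in main() is purely illustrative:

#include <stdio.h>
#include <unistd.h>

/*
 * Ask the kernel to split THPs mapped in [start, end) of process 'pid'.
 * The "%d,0x%lx,0x%lx" format matches PID_FMT in the test below.
 */
static int split_thp_range(int pid, unsigned long start, unsigned long end)
{
	FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");

	if (!f)
		return -1;
	fprintf(f, "%d,0x%lx,0x%lx", pid, start, end);
	return fclose(f);
}

int main(void)
{
	/* Illustrative range only; pick addresses you actually mapped. */
	return split_thp_range(getpid(), 0x700000000000UL,
			       0x700000400000UL) ? 1 : 0;
}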
- */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "vm_util.h" - -uint64_t pagesize; -unsigned int pageshift; -uint64_t pmd_pagesize; - -#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" -#define INPUT_MAX 80 - -#define PID_FMT "%d,0x%lx,0x%lx" -#define PATH_FMT "%s,0x%lx,0x%lx" - -#define PFN_MASK ((1UL<<55)-1) -#define KPF_THP (1UL<<22) - -int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) -{ - uint64_t paddr; - uint64_t page_flags; - - if (pagemap_file) { - pread(pagemap_file, &paddr, sizeof(paddr), - ((long)vaddr >> pageshift) * sizeof(paddr)); - - if (kpageflags_file) { - pread(kpageflags_file, &page_flags, sizeof(page_flags), - (paddr & PFN_MASK) * sizeof(page_flags)); - - return !!(page_flags & KPF_THP); - } - } - return 0; -} - -static int write_file(const char *path, const char *buf, size_t buflen) -{ - int fd; - ssize_t numwritten; - - fd = open(path, O_WRONLY); - if (fd == -1) - return 0; - - numwritten = write(fd, buf, buflen - 1); - close(fd); - if (numwritten < 1) - return 0; - - return (unsigned int) numwritten; -} - -static void write_debugfs(const char *fmt, ...) -{ - char input[INPUT_MAX]; - int ret; - va_list argp; - - va_start(argp, fmt); - ret = vsnprintf(input, INPUT_MAX, fmt, argp); - va_end(argp); - - if (ret >= INPUT_MAX) { - printf("%s: Debugfs input is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { - perror(SPLIT_DEBUGFS); - exit(EXIT_FAILURE); - } -} - -void split_pmd_thp(void) -{ - char *one_page; - size_t len = 4 * pmd_pagesize; - size_t i; - - one_page = memalign(pmd_pagesize, len); - - if (!one_page) { - printf("Fail to allocate memory\n"); - exit(EXIT_FAILURE); - } - - madvise(one_page, len, MADV_HUGEPAGE); - - for (i = 0; i < len; i++) - one_page[i] = (char)i; - - if (!check_huge_anon(one_page, 1, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } - - /* split all THPs */ - write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, - (uint64_t)one_page + len); - - for (i = 0; i < len; i++) - if (one_page[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } - - - if (check_huge_anon(one_page, 0, pmd_pagesize)) { - printf("Still AnonHugePages not split\n"); - exit(EXIT_FAILURE); - } - - printf("Split huge pages successful\n"); - free(one_page); -} - -void split_pte_mapped_thp(void) -{ - char *one_page, *pte_mapped, *pte_mapped2; - size_t len = 4 * pmd_pagesize; - uint64_t thp_size; - size_t i; - const char *pagemap_template = "/proc/%d/pagemap"; - const char *kpageflags_proc = "/proc/kpageflags"; - char pagemap_proc[255]; - int pagemap_fd; - int kpageflags_fd; - - if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { - perror("get pagemap proc error"); - exit(EXIT_FAILURE); - } - pagemap_fd = open(pagemap_proc, O_RDONLY); - - if (pagemap_fd == -1) { - perror("read pagemap:"); - exit(EXIT_FAILURE); - } - - kpageflags_fd = open(kpageflags_proc, O_RDONLY); - - if (kpageflags_fd == -1) { - perror("read kpageflags:"); - exit(EXIT_FAILURE); - } - - one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - madvise(one_page, len, MADV_HUGEPAGE); - - for (i = 0; i < len; i++) - one_page[i] = (char)i; - - if (!check_huge_anon(one_page, 1, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } - - /* remap the first pagesize of first THP */ - pte_mapped = 
mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); - - /* remap the Nth pagesize of Nth THP */ - for (i = 1; i < 4; i++) { - pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i, - pagesize, pagesize, - MREMAP_MAYMOVE|MREMAP_FIXED, - pte_mapped + pagesize * i); - if (pte_mapped2 == (char *)-1) { - perror("mremap failed"); - exit(EXIT_FAILURE); - } - } - - /* smap does not show THPs after mremap, use kpageflags instead */ - thp_size = 0; - for (i = 0; i < pagesize * 4; i++) - if (i % pagesize == 0 && - is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) - thp_size++; - - if (thp_size != 4) { - printf("Some THPs are missing during mremap\n"); - exit(EXIT_FAILURE); - } - - /* split all remapped THPs */ - write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, - (uint64_t)pte_mapped + pagesize * 4); - - /* smap does not show THPs after mremap, use kpageflags instead */ - thp_size = 0; - for (i = 0; i < pagesize * 4; i++) { - if (pte_mapped[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } - if (i % pagesize == 0 && - is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) - thp_size++; - } - - if (thp_size) { - printf("Still %ld THPs not split\n", thp_size); - exit(EXIT_FAILURE); - } - - printf("Split PTE-mapped huge pages successful\n"); - munmap(one_page, len); - close(pagemap_fd); - close(kpageflags_fd); -} - -void split_file_backed_thp(void) -{ - int status; - int fd; - ssize_t num_written; - char tmpfs_template[] = "/tmp/thp_split_XXXXXX"; - const char *tmpfs_loc = mkdtemp(tmpfs_template); - char testfile[INPUT_MAX]; - uint64_t pgoff_start = 0, pgoff_end = 1024; - - printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); - - status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); - - if (status) { - printf("Unable to create a tmpfs for testing\n"); - exit(EXIT_FAILURE); - } - - status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); - if (status >= INPUT_MAX) { - printf("Fail to create file-backed THP split testing file\n"); - goto cleanup; - } - - fd = open(testfile, O_CREAT|O_WRONLY); - if (fd == -1) { - perror("Cannot open testing file\n"); - goto cleanup; - } - - /* write something to the file, so a file-backed THP can be allocated */ - num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1); - close(fd); - - if (num_written < 1) { - printf("Fail to write data to testing file\n"); - goto cleanup; - } - - /* split the file-backed THP */ - write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); - - status = unlink(testfile); - if (status) - perror("Cannot remove testing file\n"); - -cleanup: - status = umount(tmpfs_loc); - if (status) { - printf("Unable to umount %s\n", tmpfs_loc); - exit(EXIT_FAILURE); - } - status = rmdir(tmpfs_loc); - if (status) { - perror("cannot remove tmp dir"); - exit(EXIT_FAILURE); - } - - printf("file-backed THP split test done, please check dmesg for more information\n"); -} - -int main(int argc, char **argv) -{ - if (geteuid() != 0) { - printf("Please run the benchmark as root\n"); - exit(EXIT_FAILURE); - } - - pagesize = getpagesize(); - pageshift = ffs(pagesize) - 1; - pmd_pagesize = read_pmd_pagesize(); - - split_pmd_thp(); - split_pte_mapped_thp(); - split_file_backed_thp(); - - return 0; -} diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh deleted file mode 100755 index 46e19b5d648d..000000000000 --- a/tools/testing/selftests/vm/test_hmm.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -# 
SPDX-License-Identifier: GPL-2.0 -# -# Copyright (C) 2018 Uladzislau Rezki (Sony) -# -# This is a test script for the kernel HMM test driver (test_hmm). -# Therefore it is just a kernel module loader: it loads the driver -# and runs the hmm-tests program against it. - -TEST_NAME="test_hmm" -DRIVER="test_hmm" - -# 1 if fails -exitcode=1 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -check_test_requirements() -{ - uid=$(id -u) - if [ $uid -ne 0 ]; then - echo "$0: Must be run as root" - exit $ksft_skip - fi - - if ! which modprobe > /dev/null 2>&1; then - echo "$0: You need modprobe installed" - exit $ksft_skip - fi - - if ! modinfo $DRIVER > /dev/null 2>&1; then - echo "$0: You must have the following enabled in your kernel:" - echo "CONFIG_TEST_HMM=m" - exit $ksft_skip - fi -} - -load_driver() -{ - if [ $# -eq 0 ]; then - modprobe $DRIVER > /dev/null 2>&1 - else - if [ $# -eq 2 ]; then - modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 > /dev/null 2>&1 - else - echo "Missing module parameters. Make sure to pass"\ - "spm_addr_dev0 and spm_addr_dev1" - usage - fi - fi -} - -unload_driver() -{ - modprobe -r $DRIVER > /dev/null 2>&1 -} - -run_smoke() -{ - echo "Running smoke test. Note, this test provides basic coverage." - - load_driver $1 $2 - $(dirname "${BASH_SOURCE[0]}")/hmm-tests - unload_driver -} - -usage() -{ - echo -n "Usage: $0" - echo - echo "Example usage:" - echo - echo "# Shows help message" - echo "./${TEST_NAME}.sh" - echo - echo "# Smoke testing" - echo "./${TEST_NAME}.sh smoke" - echo - echo "# Smoke testing with SPM enabled" - echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>" - echo - exit 0 -} - -function run_test() -{ - if [ $# -eq 0 ]; then - usage - else - if [ "$1" = "smoke" ]; then - run_smoke $2 $3 - else - usage - fi - fi -} - -check_test_requirements -run_test $@ - -exit 0 diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh deleted file mode 100755 index d73b846736f1..000000000000 --- a/tools/testing/selftests/vm/test_vmalloc.sh +++ /dev/null @@ -1,177 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Copyright (C) 2018 Uladzislau Rezki (Sony) -# -# This is a test script for the kernel test driver to analyse vmalloc -# allocator. Therefore it is just a kernel module loader. You can specify -# and pass different parameters in order to: -# a) analyse performance of vmalloc allocations; -# b) stressing and stability check of vmalloc subsystem. - -TEST_NAME="vmalloc" -DRIVER="test_${TEST_NAME}" -NUM_CPUS=`grep -c ^processor /proc/cpuinfo` - -# 1 if fails -exitcode=1 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -# -# Static templates for performance, stressing and smoke tests. -# Also it is possible to pass any supported parameters manually. -# -PERF_PARAM="sequential_test_order=1 test_repeat_count=3" -SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10" -STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20" - -check_test_requirements() -{ - uid=$(id -u) - if [ $uid -ne 0 ]; then - echo "$0: Must be run as root" - exit $ksft_skip - fi - - if ! which modprobe > /dev/null 2>&1; then - echo "$0: You need modprobe installed" - exit $ksft_skip - fi - - if ! 
modinfo $DRIVER > /dev/null 2>&1; then - echo "$0: You must have the following enabled in your kernel:" - echo "CONFIG_TEST_VMALLOC=m" - exit $ksft_skip - fi -} - -run_performance_check() -{ - echo "Run performance tests to evaluate how fast vmalloc allocation is." - echo "It runs all test cases on one single CPU with sequential order." - - modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 - echo "Done." - echo "Check the kernel message buffer to see the summary." -} - -run_stability_check() -{ - echo "Run stability tests. In order to stress vmalloc subsystem all" - echo "available test cases are run by NUM_CPUS workers simultaneously." - echo "It will take time, so be patient." - - modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 - echo "Done." - echo "Check the kernel ring buffer to see the summary." -} - -run_smoke_check() -{ - echo "Run smoke test. Note, this test provides basic coverage." - echo "Please check the $0 output to see how it can be used" - echo "for deep performance analysis as well as stress testing." - - modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1 - echo "Done." - echo "Check the kernel ring buffer to see the summary." -} - -usage() -{ - echo -n "Usage: $0 [ performance ] | [ stress ] | [ smoke ] | " - echo "manual parameters" - echo - echo "Valid tests and parameters:" - echo - modinfo $DRIVER - echo - echo "Example usage:" - echo - echo "# Shows help message" - echo "./${DRIVER}.sh" - echo - echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers" - echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5" - echo - echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with " - echo "sequential order" - echo -n "./${DRIVER}.sh sequential_test_order=1 " - echo "run_test_mask=23" - echo - echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats " - echo "20 times" - echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20" - echo - echo "# Performance analysis" - echo "./${DRIVER}.sh performance" - echo - echo "# Stress testing" - echo "./${DRIVER}.sh stress" - echo - exit 0 -} - -function validate_passed_args() -{ - VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'` - - # - # Something has been passed, check it. - # - for passed_arg in $@; do - key=${passed_arg//=*/} - val="${passed_arg:$((${#key}+1))}" - valid=0 - - for valid_arg in $VALID_ARGS; do - if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then - valid=1 - break - fi - done - - if [[ $valid -ne 1 ]]; then - echo "Error: key or value is not correct: ${key} $val" - exit $exitcode - fi - done -} - -function run_manual_check() -{ - # - # Validate passed parameters. If there is a wrong one, - # the script exits and does not execute further. - # - validate_passed_args $@ - - echo "Run the test with following parameters: $@" - modprobe $DRIVER $@ > /dev/null 2>&1 - echo "Done." - echo "Check the kernel ring buffer to see the summary." -} - -function run_test() -{ - if [ $# -eq 0 ]; then - usage - else - if [[ "$1" = "performance" ]]; then - run_performance_check - elif [[ "$1" = "stress" ]]; then - run_stability_check - elif [[ "$1" = "smoke" ]]; then - run_smoke_check - else - run_manual_check $@ - fi - fi -} - -check_test_requirements -run_test $@ - -exit 0 diff --git a/tools/testing/selftests/vm/thuge-gen.c b/tools/testing/selftests/vm/thuge-gen.c deleted file mode 100644 index 361ef7192cc6..000000000000 --- a/tools/testing/selftests/vm/thuge-gen.c +++ /dev/null @@ -1,257 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Test selecting other page sizes for mmap/shmget. - - Before running this, huge pages for each huge page size must have been - reserved. - For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used. - Also shmmax must be increased. - And you need to run as root to work around some weird permissions in shm. - And nothing using huge pages should run in parallel. - When the program aborts you may need to clean up the shm segments with - ipcrm -m by hand, like this - sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m - (warning: this will remove all of them if someone else uses them) */ - -#define _GNU_SOURCE 1 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define err(x) perror(x), exit(1) - -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f -#if !defined(MAP_HUGETLB) -#define MAP_HUGETLB 0x40000 -#endif - -#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ -#define SHM_HUGE_SHIFT 26 -#define SHM_HUGE_MASK 0x3f -#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) -#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) - -#define NUM_PAGESIZES 5 - -#define NUM_PAGES 4 - -#define Dprintf(fmt...) // printf(fmt) - -unsigned long page_sizes[NUM_PAGESIZES]; -int num_page_sizes; - -int ilog2(unsigned long v) -{ - int l = 0; - while ((1UL << l) < v) - l++; - return l; -} - -void find_pagesizes(void) -{ - glob_t g; - int i; - glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); - assert(g.gl_pathc <= NUM_PAGESIZES); - for (i = 0; i < g.gl_pathc; i++) { - sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", - &page_sizes[i]); - page_sizes[i] <<= 10; - printf("Found %luMB\n", page_sizes[i] >> 20); - } - num_page_sizes = g.gl_pathc; - globfree(&g); -} - -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - free(line); - return hps; -} - -void show(unsigned long ps) -{ - char buf[100]; - if (ps == getpagesize()) - return; - printf("%luMB: ", ps >> 20); - fflush(stdout); - snprintf(buf, sizeof buf, - "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", - ps >> 10); - system(buf); -} - -unsigned long read_sysfs(int warn, char *fmt, ...) 
-{ - char *line = NULL; - size_t linelen = 0; - char buf[100]; - FILE *f; - va_list ap; - unsigned long val = 0; - - va_start(ap, fmt); - vsnprintf(buf, sizeof buf, fmt, ap); - va_end(ap); - - f = fopen(buf, "r"); - if (!f) { - if (warn) - printf("missing %s\n", buf); - return 0; - } - if (getline(&line, &linelen, f) > 0) { - sscanf(line, "%lu", &val); - } - fclose(f); - free(line); - return val; -} - -unsigned long read_free(unsigned long ps) -{ - return read_sysfs(ps != getpagesize(), - "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", - ps >> 10); -} - -void test_mmap(unsigned long size, unsigned flags) -{ - char *map; - unsigned long before, after; - int err; - - before = read_free(size); - map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0); - - if (map == (char *)-1) err("mmap"); - memset(map, 0xff, size*NUM_PAGES); - after = read_free(size); - Dprintf("before %lu after %lu diff %ld size %lu\n", - before, after, before - after, size); - assert(size == getpagesize() || (before - after) == NUM_PAGES); - show(size); - err = munmap(map, size); - assert(!err); -} - -void test_shmget(unsigned long size, unsigned flags) -{ - int id; - unsigned long before, after; - int err; - - before = read_free(size); - id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags); - if (id < 0) err("shmget"); - - struct shm_info i; - if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl"); - Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss); - - - Dprintf("id %d\n", id); - char *map = shmat(id, NULL, 0600); - if (map == (char*)-1) err("shmat"); - - shmctl(id, IPC_RMID, NULL); - - memset(map, 0xff, size*NUM_PAGES); - after = read_free(size); - - Dprintf("before %lu after %lu diff %ld size %lu\n", - before, after, before - after, size); - assert(size == getpagesize() || (before - after) == NUM_PAGES); - show(size); - err = shmdt(map); - assert(!err); -} - -void sanity_checks(void) -{ - int i; - unsigned long largest = getpagesize(); - - for (i = 0; i < num_page_sizes; i++) { - if (page_sizes[i] > largest) - largest = page_sizes[i]; - - if (read_free(page_sizes[i]) < NUM_PAGES) { - printf("Not enough huge pages for page size %lu MB, need %u\n", - page_sizes[i] >> 20, - NUM_PAGES); - exit(0); - } - } - - if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) { - printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES); - exit(0); - } - -#if defined(__x86_64__) - if (largest != 1U<<30) { - printf("No GB pages available on x86-64\n" - "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); - exit(0); - } -#endif -} - -int main(void) -{ - int i; - unsigned default_hps = default_huge_page_size(); - - find_pagesizes(); - - sanity_checks(); - - for (i = 0; i < num_page_sizes; i++) { - unsigned long ps = page_sizes[i]; - int arg = ilog2(ps) << MAP_HUGE_SHIFT; - printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg); - test_mmap(ps, MAP_HUGETLB | arg); - } - printf("Testing default huge mmap\n"); - test_mmap(default_hps, SHM_HUGETLB); - - puts("Testing non-huge shmget"); - test_shmget(getpagesize(), 0); - - for (i = 0; i < num_page_sizes; i++) { - unsigned long ps = page_sizes[i]; - int arg = ilog2(ps) << SHM_HUGE_SHIFT; - printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg); - test_shmget(ps, SHM_HUGETLB | arg); - } - puts("default huge shmget"); - test_shmget(default_hps, SHM_HUGETLB); - - return 0; -} diff --git a/tools/testing/selftests/vm/transhuge-stress.c 
b/tools/testing/selftests/vm/transhuge-stress.c deleted file mode 100644 index e3f00adb1b82..000000000000 --- a/tools/testing/selftests/vm/transhuge-stress.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Stress test for transparent huge pages, memory compaction and migration. - * - * Authors: Konstantin Khlebnikov - * - * This is free and unencumbered software released into the public domain. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "util.h" - -int backing_fd = -1; -int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; -#define PROT_RW (PROT_READ | PROT_WRITE) - -int main(int argc, char **argv) -{ - size_t ram, len; - void *ptr, *p; - struct timespec a, b; - int i = 0; - char *name = NULL; - double s; - uint8_t *map; - size_t map_len; - int pagemap_fd; - - ram = sysconf(_SC_PHYS_PAGES); - if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4) - ram = SIZE_MAX / 4; - else - ram *= sysconf(_SC_PAGESIZE); - len = ram; - - while (++i < argc) { - if (!strcmp(argv[i], "-h")) - errx(1, "usage: %s [size in MiB]", argv[0]); - else if (!strcmp(argv[i], "-f")) - name = argv[++i]; - else - len = atoll(argv[i]) << 20; - } - - if (name) { - backing_fd = open(name, O_RDWR); - if (backing_fd == -1) - errx(2, "open %s", name); - mmap_flags = MAP_SHARED; - } - - warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" - " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, - ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); - - pagemap_fd = open("/proc/self/pagemap", O_RDONLY); - if (pagemap_fd < 0) - err(2, "open pagemap"); - - len -= len % HPAGE_SIZE; - ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0); - if (ptr == MAP_FAILED) - err(2, "initial mmap"); - ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE; - - if (madvise(ptr, len, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); - - map_len = ram >> (HPAGE_SHIFT - 1); - map = malloc(map_len); - if (!map) - errx(2, "map malloc"); - - while (1) { - int nr_succeed = 0, nr_failed = 0, nr_pages = 0; - - memset(map, 0, map_len); - - clock_gettime(CLOCK_MONOTONIC, &a); - for (p = ptr; p < ptr + len; p += HPAGE_SIZE) { - int64_t pfn; - - pfn = allocate_transhuge(p, pagemap_fd); - - if (pfn < 0) { - nr_failed++; - } else { - size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT); - - nr_succeed++; - if (idx >= map_len) { - map = realloc(map, idx + 1); - if (!map) - errx(2, "map realloc"); - memset(map + map_len, 0, idx + 1 - map_len); - map_len = idx + 1; - } - if (!map[idx]) - nr_pages++; - map[idx] = 1; - } - - /* split transhuge page, keep last page */ - if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED)) - err(2, "MADV_DONTNEED"); - } - clock_gettime(CLOCK_MONOTONIC, &b); - s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.; - - warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" - "%4d succeed, %4d failed, %4d different pages", - s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), - nr_succeed, nr_failed, nr_pages); - } -} diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c deleted file mode 100644 index 7f22844ed704..000000000000 --- a/tools/testing/selftests/vm/userfaultfd.c +++ /dev/null @@ -1,1858 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Stress userfaultfd syscall. - * - * Copyright (C) 2015 Red Hat, Inc. - * - * This test allocates two virtual areas and bounces the physical - * memory across the two virtual areas (from area_src to area_dst) - * using userfaultfd. 
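Before the three-threads-per-CPU machinery enumerated next, it may help to see the core userfaultfd sequence on its own: open the fd, handshake with UFFDIO_API, register a range for missing faults, and resolve a fault with UFFDIO_COPY, the primitive whose atomicity this stress test verifies. A minimal single-fault sketch, independent of this test (assumes Linux 5.11+ UAPI headers for UFFD_USER_MODE_ONLY; build with -pthread):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static long page_size;

/* Resolve exactly one missing fault by copying in a pre-filled page. */
static void *fault_handler(void *arg)
{
	int uffd = (int)(long)arg;
	struct uffd_msg msg;
	char *patch = malloc(page_size);

	if (!patch)
		exit(1);
	memset(patch, 0x5a, page_size);

	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg) ||
	    msg.event != UFFD_EVENT_PAGEFAULT)
		exit(1);

	struct uffdio_copy copy = {
		.dst = msg.arg.pagefault.address & ~(page_size - 1),
		.src = (unsigned long)patch,
		.len = page_size,
	};
	if (ioctl(uffd, UFFDIO_COPY, &copy))
		exit(1);
	return NULL;
}

int main(void)
{
	page_size = sysconf(_SC_PAGESIZE);

	int uffd = syscall(SYS_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY);
	struct uffdio_api api = { .api = UFFD_API };
	char *area = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	pthread_t thr;

	if (uffd < 0 || area == MAP_FAILED || ioctl(uffd, UFFDIO_API, &api))
		return 1;

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = page_size },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return 1;

	pthread_create(&thr, NULL, fault_handler, (void *)(long)uffd);

	/* First touch blocks in a missing fault until the handler copies. */
	printf("first byte: 0x%x\n", (unsigned char)area[0]);
	pthread_join(thr, NULL);
	return 0;
}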
- * - * There are three threads running per CPU: - * - * 1) one per-CPU thread takes a per-page pthread_mutex in a random - * page of the area_dst (while the physical page may still be in - * area_src), and increments a per-page counter in the same page, - * and checks its value against a verification region. - * - * 2) another per-CPU thread handles the userfaults generated by - * thread 1 above. userfaultfd blocking reads or poll() modes are - * exercised interleaved. - * - * 3) one last per-CPU thread transfers the memory in the background - * at maximum bandwidth (if not already transferred by thread - * 2). Each cpu thread takes cares of transferring a portion of the - * area. - * - * When all threads of type 3 completed the transfer, one bounce is - * complete. area_src and area_dst are then swapped. All threads are - * respawned and so the bounce is immediately restarted in the - * opposite direction. - * - * per-CPU threads 1 by triggering userfaults inside - * pthread_mutex_lock will also verify the atomicity of the memory - * transfer (UFFDIO_COPY). - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" -#include "vm_util.h" - -#ifdef __NR_userfaultfd - -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; - -#define BOUNCE_RANDOM (1<<0) -#define BOUNCE_RACINGFAULTS (1<<1) -#define BOUNCE_VERIFY (1<<2) -#define BOUNCE_POLL (1<<3) -static int bounces; - -#define TEST_ANON 1 -#define TEST_HUGETLB 2 -#define TEST_SHMEM 3 -static int test_type; - -#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) - -#define BASE_PMD_ADDR ((void *)(1UL << 30)) - -/* test using /dev/userfaultfd, instead of userfaultfd(2) */ -static bool test_dev_userfaultfd; - -/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ -#define ALARM_INTERVAL_SECS 10 -static volatile bool test_uffdio_copy_eexist = true; -static volatile bool test_uffdio_zeropage_eexist = true; -/* Whether to test uffd write-protection */ -static bool test_uffdio_wp = true; -/* Whether to test uffd minor faults */ -static bool test_uffdio_minor = false; -static bool map_shared; -static int mem_fd; -static unsigned long long *count_verify; -static int uffd = -1; -static int uffd_flags, finished, *pipefd; -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; -static char *zeropage; -pthread_attr_t attr; -static bool test_collapse; - -/* Userfaultfd test statistics */ -struct uffd_stats { - int cpu; - unsigned long missing_faults; - unsigned long wp_faults; - unsigned long minor_faults; -}; - -/* pthread_mutex_t starts at page offset 0 */ -#define area_mutex(___area, ___nr) \ - ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) -/* - * count is placed in the page after pthread_mutex_t naturally aligned - * to avoid non alignment faults on non-x86 archs. 
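The area_count() macro that follows packs a counter immediately after the pthread_mutex_t and rounds the address up to an unsigned long long boundary. The align-up idiom in isolation, as a sketch (valid for power-of-two alignments only):

#include <stdint.h>
#include <stdio.h>

/* Round p up to the next multiple of a (a must be a power of two). */
static void *align_up(void *p, uintptr_t a)
{
	return (void *)(((uintptr_t)p + a - 1) & ~(a - 1));
}

int main(void)
{
	char buf[64];

	printf("%p -> %p (8-byte aligned)\n",
	       (void *)(buf + 1), align_up(buf + 1, 8));
	return 0;
}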
- */ -#define area_count(___area, ___nr) \ - ((volatile unsigned long long *) ((unsigned long) \ - ((___area) + (___nr)*page_size + \ - sizeof(pthread_mutex_t) + \ - sizeof(unsigned long long) - 1) & \ - ~(unsigned long)(sizeof(unsigned long long) \ - - 1))) - -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - -#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) - -const char *examples = - "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" - "./userfaultfd anon 100 99999\n\n" - "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" - "./userfaultfd anon:dev 100 99999\n\n" - "# Run shared memory test on 1GiB region with 99 bounces:\n" - "./userfaultfd shmem 1000 99\n\n" - "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" - "./userfaultfd hugetlb 256 50\n\n" - "# Run the same hugetlb test but using shared file:\n" - "./userfaultfd hugetlb_shared 256 50\n\n" - "# 10MiB-~6GiB 999 bounces anonymous test, " - "continue forever unless an error triggers\n" - "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; - -static void usage(void) -{ - fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> " - "[hugetlbfs_file]\n\n"); - fprintf(stderr, "Supported <test type>: anon, hugetlb, " - "hugetlb_shared, shmem\n\n"); - fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " - "Supported mods:\n"); - fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); - fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); - fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" - "memory\n"); - fprintf(stderr, "\nExample test mod usage:\n"); - fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); - fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); - - fprintf(stderr, "Examples:\n\n"); - fprintf(stderr, "%s", examples); - exit(1); -} - -#define _err(fmt, ...) \ - do { \ - int ret = errno; \ - fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ - fprintf(stderr, " (errno=%d, line=%d)\n", \ - ret, __LINE__); \ - } while (0) - -#define errexit(exitcode, fmt, ...) \ - do { \ - _err(fmt, ##__VA_ARGS__); \ - exit(exitcode); \ - } while (0) - -#define err(fmt, ...) 
errexit(1, fmt, ##__VA_ARGS__) - -static void uffd_stats_reset(struct uffd_stats *uffd_stats, - unsigned long n_cpus) -{ - int i; - - for (i = 0; i < n_cpus; i++) { - uffd_stats[i].cpu = i; - uffd_stats[i].missing_faults = 0; - uffd_stats[i].wp_faults = 0; - uffd_stats[i].minor_faults = 0; - } -} - -static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) -{ - int i; - unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; - - for (i = 0; i < n_cpus; i++) { - miss_total += stats[i].missing_faults; - wp_total += stats[i].wp_faults; - minor_total += stats[i].minor_faults; - } - - printf("userfaults: "); - if (miss_total) { - printf("%llu missing (", miss_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].missing_faults); - printf("\b) "); - } - if (wp_total) { - printf("%llu wp (", wp_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].wp_faults); - printf("\b) "); - } - if (minor_total) { - printf("%llu minor (", minor_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].minor_faults); - printf("\b)"); - } - printf("\n"); -} - -static void anon_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); -} - -static void anon_allocate_area(void **alloc_area, bool is_src) -{ - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); -} - -static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ -} - -static void hugetlb_release_pages(char *rel_area) -{ - if (!map_shared) { - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); - } else { - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); - } -} - -static void hugetlb_allocate_area(void **alloc_area, bool is_src) -{ - off_t size = nr_pages * page_size; - off_t offset = is_src ? 0 : size; - void *area_alias = NULL; - char **alloc_area_alias; - - *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, - (map_shared ? MAP_SHARED : MAP_PRIVATE) | - (is_src ? 0 : MAP_NORESERVE), - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of hugetlbfs file failed"); - - if (map_shared) { - area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED, mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of hugetlb file alias failed"); - } - - if (is_src) { - alloc_area_alias = &area_src_alias; - } else { - alloc_area_alias = &area_dst_alias; - } - if (area_alias) - *alloc_area_alias = area_alias; -} - -static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - if (!map_shared) - return; - - *start = (unsigned long) area_dst_alias + offset; -} - -static void shmem_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); -} - -static void shmem_allocate_area(void **alloc_area, bool is_src) -{ - void *area_alias = NULL; - size_t bytes = nr_pages * page_size; - unsigned long offset = is_src ? 
0 : bytes; - char *p = NULL, *p_alias = NULL; - - if (test_collapse) { - p = BASE_PMD_ADDR; - if (!is_src) - /* src map + alias + interleaved hpages */ - p += 2 * (bytes + hpage_size); - p_alias = p; - p_alias += bytes; - p_alias += hpage_size; /* Prevent src/dst VMA merge */ - } - - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of memfd failed"); - if (test_collapse && *alloc_area != p) - err("mmap of memfd failed at %p", p); - - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of memfd alias failed"); - if (test_collapse && area_alias != p_alias) - err("mmap of anonymous memory failed at %p", p_alias); - - if (is_src) - area_src_alias = area_alias; - else - area_dst_alias = area_alias; -} - -static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - *start = (unsigned long)area_dst_alias + offset; -} - -static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) -{ - if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) - err("Did not find expected %d number of hugepages", - expect_nr_hpages); -} - -struct uffd_test_ops { - void (*allocate_area)(void **alloc_area, bool is_src); - void (*release_pages)(char *rel_area); - void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); - void (*check_pmd_mapping)(void *p, int expect_nr_hpages); -}; - -static struct uffd_test_ops anon_uffd_test_ops = { - .allocate_area = anon_allocate_area, - .release_pages = anon_release_pages, - .alias_mapping = noop_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops shmem_uffd_test_ops = { - .allocate_area = shmem_allocate_area, - .release_pages = shmem_release_pages, - .alias_mapping = shmem_alias_mapping, - .check_pmd_mapping = shmem_check_pmd_mapping, -}; - -static struct uffd_test_ops hugetlb_uffd_test_ops = { - .allocate_area = hugetlb_allocate_area, - .release_pages = hugetlb_release_pages, - .alias_mapping = hugetlb_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops *uffd_test_ops; - -static inline uint64_t uffd_minor_feature(void) -{ - if (test_type == TEST_HUGETLB && map_shared) - return UFFD_FEATURE_MINOR_HUGETLBFS; - else if (test_type == TEST_SHMEM) - return UFFD_FEATURE_MINOR_SHMEM; - else - return 0; -} - -static uint64_t get_expected_ioctls(uint64_t mode) -{ - uint64_t ioctls = UFFD_API_RANGE_IOCTLS; - - if (test_type == TEST_HUGETLB) - ioctls &= ~(1 << _UFFDIO_ZEROPAGE); - - if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) - ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); - - if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) - ioctls &= ~(1 << _UFFDIO_CONTINUE); - - return ioctls; -} - -static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) -{ - uint64_t expected = get_expected_ioctls(mode); - uint64_t actual = ioctls & expected; - - if (actual != expected) { - err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, - expected, actual); - } -} - -static int __userfaultfd_open_dev(void) -{ - int fd, _uffd; - - fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); - if (fd < 0) - errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); - - _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); - if (_uffd < 0) - errexit(errno == ENOTTY ? 
KSFT_SKIP : 1, - "creating userfaultfd failed"); - close(fd); - return _uffd; -} - -static void userfaultfd_open(uint64_t *features) -{ - struct uffdio_api uffdio_api; - - if (test_dev_userfaultfd) - uffd = __userfaultfd_open_dev(); - else { - uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); - if (uffd < 0) - errexit(errno == ENOSYS ? KSFT_SKIP : 1, - "creating userfaultfd failed"); - } - uffd_flags = fcntl(uffd, F_GETFD, NULL); - - uffdio_api.api = UFFD_API; - uffdio_api.features = *features; - if (ioctl(uffd, UFFDIO_API, &uffdio_api)) - err("UFFDIO_API failed.\nPlease make sure to " - "run with either root or ptrace capability."); - if (uffdio_api.api != UFFD_API) - err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); - - *features = uffdio_api.features; -} - -static inline void munmap_area(void **area) -{ - if (*area) - if (munmap(*area, nr_pages * page_size)) - err("munmap"); - - *area = NULL; -} - -static void uffd_test_ctx_clear(void) -{ - size_t i; - - if (pipefd) { - for (i = 0; i < nr_cpus * 2; ++i) { - if (close(pipefd[i])) - err("close pipefd"); - } - free(pipefd); - pipefd = NULL; - } - - if (count_verify) { - free(count_verify); - count_verify = NULL; - } - - if (uffd != -1) { - if (close(uffd)) - err("close uffd"); - uffd = -1; - } - - munmap_area((void **)&area_src); - munmap_area((void **)&area_src_alias); - munmap_area((void **)&area_dst); - munmap_area((void **)&area_dst_alias); - munmap_area((void **)&area_remap); -} - -static void uffd_test_ctx_init(uint64_t features) -{ - unsigned long nr, cpu; - - uffd_test_ctx_clear(); - - uffd_test_ops->allocate_area((void **)&area_src, true); - uffd_test_ops->allocate_area((void **)&area_dst, false); - - userfaultfd_open(&features); - - count_verify = malloc(nr_pages * sizeof(unsigned long long)); - if (!count_verify) - err("count_verify"); - - for (nr = 0; nr < nr_pages; nr++) { - *area_mutex(area_src, nr) = - (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; - count_verify[nr] = *area_count(area_src, nr) = 1; - /* - * In the transition from 255 to 256, powerpc will - * read out of order in my_bcmp and see both bytes as - * zero, so leave a placeholder below always non-zero - * after the count, to avoid my_bcmp triggering false - * positives. - */ - *(area_count(area_src, nr) + 1) = 1; - } - - /* - * After initialization of area_src, we must explicitly release pages - * for area_dst to make sure it's fully empty. Otherwise we could have - * some area_dst pages be erroneously initialized with zero pages, - * hence we could hit memory corruption later in the test. - * - * One example is when THP is globally enabled, above allocate_area() - * calls could have the two areas merged into a single VMA (as they - * will have the same VMA flags so they're mergeable). When we - * initialize the area_src above, it's possible that some part of - * area_dst could have been faulted in via one huge THP that will be - * shared between area_src and area_dst. That could cause some of the - * area_dst pages not to be trapped by missing userfaults. - * - * This release_pages() guarantees that even if that happened, we'll - * proactively split the thp and drop any accidentally initialized - * pages within area_dst.
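- * (Concretely: with 4 KiB base pages and a 2 MiB THP, a single huge - * page straddling the two areas can pre-populate up to 512 area_dst - * pages at once, and none of those pages would later raise the - * MISSING userfault the test relies on.)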
- */ - uffd_test_ops->release_pages(area_dst); - - pipefd = malloc(sizeof(int) * nr_cpus * 2); - if (!pipefd) - err("pipefd"); - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) - err("pipe"); -} - -static int my_bcmp(char *str1, char *str2, size_t n) -{ - unsigned long i; - for (i = 0; i < n; i++) - if (str1[i] != str2[i]) - return 1; - return 0; -} - -static void wp_range(int ufd, __u64 start, __u64 len, bool wp) -{ - struct uffdio_writeprotect prms; - - /* Write protection page faults */ - prms.range.start = start; - prms.range.len = len; - /* Undo write-protect, do wakeup after that */ - prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; - - if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) - err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); -} - -static void continue_range(int ufd, __u64 start, __u64 len) -{ - struct uffdio_continue req; - int ret; - - req.range.start = start; - req.range.len = len; - req.mode = 0; - - if (ioctl(ufd, UFFDIO_CONTINUE, &req)) - err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, - (uint64_t)start); - - /* - * Error handling within the kernel for continue is subtly different - * from copy or zeropage, so it may be a source of bugs. Trigger an - * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. - */ - req.mapped = 0; - ret = ioctl(ufd, UFFDIO_CONTINUE, &req); - if (ret >= 0 || req.mapped != -EEXIST) - err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, - ret, (int64_t) req.mapped); -} - -static void *locking_thread(void *arg) -{ - unsigned long cpu = (unsigned long) arg; - unsigned long page_nr; - unsigned long long count; - - if (!(bounces & BOUNCE_RANDOM)) { - page_nr = -bounces; - if (!(bounces & BOUNCE_RACINGFAULTS)) - page_nr += cpu * nr_pages_per_cpu; - } - - while (!finished) { - if (bounces & BOUNCE_RANDOM) { - if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) - err("getrandom failed"); - } else - page_nr += 1; - page_nr %= nr_pages; - pthread_mutex_lock(area_mutex(area_dst, page_nr)); - count = *area_count(area_dst, page_nr); - if (count != count_verify[page_nr]) - err("page_nr %lu memory corruption %llu %llu", - page_nr, count, count_verify[page_nr]); - count++; - *area_count(area_dst, page_nr) = count_verify[page_nr] = count; - pthread_mutex_unlock(area_mutex(area_dst, page_nr)); - } - - return NULL; -} - -static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_copy->dst, - uffdio_copy->len, - offset); - if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy->copy != -EEXIST) - err("UFFDIO_COPY retry error: %"PRId64, - (int64_t)uffdio_copy->copy); - } else { - err("UFFDIO_COPY retry unexpected: %"PRId64, - (int64_t)uffdio_copy->copy); - } -} - -static void wake_range(int ufd, unsigned long addr, unsigned long len) -{ - struct uffdio_range uffdio_wake; - - uffdio_wake.start = addr; - uffdio_wake.len = len; - - if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) - fprintf(stderr, "error waking %lu\n", - addr), exit(1); -} - -static int __copy_page(int ufd, unsigned long offset, bool retry) -{ - struct uffdio_copy uffdio_copy; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu\n", offset); - uffdio_copy.dst = (unsigned long) area_dst + offset; - uffdio_copy.src = (unsigned long) area_src + offset; - uffdio_copy.len = page_size; - if (test_uffdio_wp) - uffdio_copy.mode = UFFDIO_COPY_MODE_WP; - else - 
uffdio_copy.mode = 0; - uffdio_copy.copy = 0; - if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy.copy != -EEXIST) - err("UFFDIO_COPY error: %"PRId64, - (int64_t)uffdio_copy.copy); - wake_range(ufd, uffdio_copy.dst, page_size); - } else if (uffdio_copy.copy != page_size) { - err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); - } else { - if (test_uffdio_copy_eexist && retry) { - test_uffdio_copy_eexist = false; - retry_copy_page(ufd, &uffdio_copy, offset); - } - return 1; - } - return 0; -} - -static int copy_page_retry(int ufd, unsigned long offset) -{ - return __copy_page(ufd, offset, true); -} - -static int copy_page(int ufd, unsigned long offset) -{ - return __copy_page(ufd, offset, false); -} - -static int uffd_read_msg(int ufd, struct uffd_msg *msg) -{ - int ret = read(uffd, msg, sizeof(*msg)); - - if (ret != sizeof(*msg)) { - if (ret < 0) { - if (errno == EAGAIN || errno == EINTR) - return 1; - err("blocking read error"); - } else { - err("short read"); - } - } - - return 0; -} - -static void uffd_handle_page_fault(struct uffd_msg *msg, - struct uffd_stats *stats) -{ - unsigned long offset; - - if (msg->event != UFFD_EVENT_PAGEFAULT) - err("unexpected msg event %u", msg->event); - - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { - /* Write protect page faults */ - wp_range(uffd, msg->arg.pagefault.address, page_size, false); - stats->wp_faults++; - } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { - uint8_t *area; - int b; - - /* - * Minor page faults - * - * To prove we can modify the original range for testing - * purposes, we're going to bit flip this range before - * continuing. - * - * Note that this requires all minor page fault tests operate on - * area_dst (non-UFFD-registered) and area_dst_alias - * (UFFD-registered). - */ - - area = (uint8_t *)(area_dst + - ((char *)msg->arg.pagefault.address - - area_dst_alias)); - for (b = 0; b < page_size; ++b) - area[b] = ~area[b]; - continue_range(uffd, msg->arg.pagefault.address, page_size); - stats->minor_faults++; - } else { - /* - * Missing page faults. - * - * Here we force a write check for each of the missing mode - * faults. It's guaranteed because the only threads that - * will trigger uffd faults are the locking threads, and - * their first instruction to touch the missing page will - * always be pthread_mutex_lock(). - * - * Note that here we relied on an NPTL glibc impl detail to - * always read the lock type at the entry of the lock op - * (pthread_mutex_t.__data.__type, offset 0x10) before - * doing any locking operations to guarantee that. It's - * actually not good to rely on this impl detail because - * logically a pthread-compatible lib can implement the - * locks without types and we can fail when linking with - * them. However since we used to find bugs with this - * strict check we still keep it around. Hopefully this - * could be a good hint when it fails again. If one day - * it'll break on some other impl of glibc we'll revisit. 
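- * In short: the first access to a missing page is that read of the - * lock type, so the fault handled below must never carry - * UFFD_PAGEFAULT_FLAG_WRITE; the check following this comment - * enforces exactly that.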
- */ - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - err("unexpected write fault"); - - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; - offset &= ~(page_size-1); - - if (copy_page(uffd, offset)) - stats->missing_faults++; - } -} - -static void *uffd_poll_thread(void *arg) -{ - struct uffd_stats *stats = (struct uffd_stats *)arg; - unsigned long cpu = stats->cpu; - struct pollfd pollfd[2]; - struct uffd_msg msg; - struct uffdio_register uffd_reg; - int ret; - char tmp_chr; - - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd[cpu*2]; - pollfd[1].events = POLLIN; - - for (;;) { - ret = poll(pollfd, 2, -1); - if (ret <= 0) { - if (errno == EINTR || errno == EAGAIN) - continue; - err("poll error: %d", ret); - } - if (pollfd[1].revents & POLLIN) { - if (read(pollfd[1].fd, &tmp_chr, 1) != 1) - err("read pipefd error"); - break; - } - if (!(pollfd[0].revents & POLLIN)) - err("pollfd[0].revents %d", pollfd[0].revents); - if (uffd_read_msg(uffd, &msg)) - continue; - switch (msg.event) { - default: - err("unexpected msg event %u\n", msg.event); - break; - case UFFD_EVENT_PAGEFAULT: - uffd_handle_page_fault(&msg, stats); - break; - case UFFD_EVENT_FORK: - close(uffd); - uffd = msg.arg.fork.ufd; - pollfd[0].fd = uffd; - break; - case UFFD_EVENT_REMOVE: - uffd_reg.range.start = msg.arg.remove.start; - uffd_reg.range.len = msg.arg.remove.end - - msg.arg.remove.start; - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) - err("remove failure"); - break; - case UFFD_EVENT_REMAP: - area_remap = area_dst; /* save for later unmap */ - area_dst = (char *)(unsigned long)msg.arg.remap.to; - break; - } - } - - return NULL; -} - -pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; - -static void *uffd_read_thread(void *arg) -{ - struct uffd_stats *stats = (struct uffd_stats *)arg; - struct uffd_msg msg; - - pthread_mutex_unlock(&uffd_read_mutex); - /* from here cancellation is ok */ - - for (;;) { - if (uffd_read_msg(uffd, &msg)) - continue; - uffd_handle_page_fault(&msg, stats); - } - - return NULL; -} - -static void *background_thread(void *arg) -{ - unsigned long cpu = (unsigned long) arg; - unsigned long page_nr, start_nr, mid_nr, end_nr; - - start_nr = cpu * nr_pages_per_cpu; - end_nr = (cpu+1) * nr_pages_per_cpu; - mid_nr = (start_nr + end_nr) / 2; - - /* Copy the first half of the pages */ - for (page_nr = start_nr; page_nr < mid_nr; page_nr++) - copy_page_retry(uffd, page_nr * page_size); - - /* - * If we need to test uffd-wp, set it up now. 
Then we'll have - * at least the first half of the pages mapped already which - * can be write-protected for testing - */ - if (test_uffdio_wp) - wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, - nr_pages_per_cpu * page_size, true); - - /* - * Continue the 2nd half of the page copying, handling write - * protection faults if any - */ - for (page_nr = mid_nr; page_nr < end_nr; page_nr++) - copy_page_retry(uffd, page_nr * page_size); - - return NULL; -} - -static int stress(struct uffd_stats *uffd_stats) -{ - unsigned long cpu; - pthread_t locking_threads[nr_cpus]; - pthread_t uffd_threads[nr_cpus]; - pthread_t background_threads[nr_cpus]; - - finished = 0; - for (cpu = 0; cpu < nr_cpus; cpu++) { - if (pthread_create(&locking_threads[cpu], &attr, - locking_thread, (void *)cpu)) - return 1; - if (bounces & BOUNCE_POLL) { - if (pthread_create(&uffd_threads[cpu], &attr, - uffd_poll_thread, - (void *)&uffd_stats[cpu])) - return 1; - } else { - if (pthread_create(&uffd_threads[cpu], &attr, - uffd_read_thread, - (void *)&uffd_stats[cpu])) - return 1; - pthread_mutex_lock(&uffd_read_mutex); - } - if (pthread_create(&background_threads[cpu], &attr, - background_thread, (void *)cpu)) - return 1; - } - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pthread_join(background_threads[cpu], NULL)) - return 1; - - /* - * Be strict and immediately zap area_src, the whole area has - * been transferred already by the background threads. The - * area_src could then be faulted in a racy way by still - * running uffd threads reading zeropages after we zapped - * area_src (but they're guaranteed to get -EEXIST from - * UFFDIO_COPY without writing zero pages into area_dst - * because the background threads already completed). - */ - uffd_test_ops->release_pages(area_src); - - finished = 1; - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pthread_join(locking_threads[cpu], NULL)) - return 1; - - for (cpu = 0; cpu < nr_cpus; cpu++) { - char c; - if (bounces & BOUNCE_POLL) { - if (write(pipefd[cpu*2+1], &c, 1) != 1) - err("pipefd write error"); - if (pthread_join(uffd_threads[cpu], - (void *)&uffd_stats[cpu])) - return 1; - } else { - if (pthread_cancel(uffd_threads[cpu])) - return 1; - if (pthread_join(uffd_threads[cpu], NULL)) - return 1; - } - } - - return 0; -} - -sigjmp_buf jbuf, *sigbuf; - -static void sighndl(int sig, siginfo_t *siginfo, void *ptr) -{ - if (sig == SIGBUS) { - if (sigbuf) - siglongjmp(*sigbuf, 1); - abort(); - } -} - -/* - * For the non-cooperative userfaultfd test we fork() a process that will - * generate pagefaults, will mremap the area monitored by the - * userfaultfd and at last this process will release the monitored - * area. - * For the anonymous and shared memory the area is divided into two - * parts, the first part is accessed before mremap, and the second - * part is accessed after mremap. Since hugetlbfs does not support - * mremap, the entire monitored area is accessed in a single pass for - * HUGETLB_TEST. - * The release of the pages currently generates an event for shmem and - * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked - * for hugetlb. - * For the signal test (UFFD_FEATURE_SIGBUS), signal_test = 1: we register - * the monitored area, generate pagefaults and test that the signal is - * delivered. Use UFFDIO_COPY to allocate the missing page and retry. For - * signal_test = 2, the robustness use case: we release the monitored area, - * fork a process - * that will generate pagefaults and verify the signal is generated.
- * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal - * feature. Using monitor thread, verify no userfault events are generated. - */ -static int faulting_process(int signal_test) -{ - unsigned long nr; - unsigned long long count; - unsigned long split_nr_pages; - unsigned long lastnr; - struct sigaction act; - volatile unsigned long signalled = 0; - - split_nr_pages = (nr_pages + 1) / 2; - - if (signal_test) { - sigbuf = &jbuf; - memset(&act, 0, sizeof(act)); - act.sa_sigaction = sighndl; - act.sa_flags = SA_SIGINFO; - if (sigaction(SIGBUS, &act, 0)) - err("sigaction"); - lastnr = (unsigned long)-1; - } - - for (nr = 0; nr < split_nr_pages; nr++) { - volatile int steps = 1; - unsigned long offset = nr * page_size; - - if (signal_test) { - if (sigsetjmp(*sigbuf, 1) != 0) { - if (steps == 1 && nr == lastnr) - err("Signal repeated"); - - lastnr = nr; - if (signal_test == 1) { - if (steps == 1) { - /* This is a MISSING request */ - steps++; - if (copy_page(uffd, offset)) - signalled++; - } else { - /* This is a WP request */ - assert(steps == 2); - wp_range(uffd, - (__u64)area_dst + - offset, - page_size, false); - } - } else { - signalled++; - continue; - } - } - } - - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - /* - * Trigger write protection if there is by writing - * the same value back. - */ - *area_count(area_dst, nr) = count; - } - - if (signal_test) - return signalled != split_nr_pages; - - area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, - MREMAP_MAYMOVE | MREMAP_FIXED, area_src); - if (area_dst == MAP_FAILED) - err("mremap"); - /* Reset area_src since we just clobbered it */ - area_src = NULL; - - for (; nr < nr_pages; nr++) { - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) { - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - } - /* - * Trigger write protection if there is by writing - * the same value back. 
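- * (If uffd-wp armed this page, writing the same value back is enough - * to raise a WP fault for the monitor to resolve via wp_range().)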
- */ - *area_count(area_dst, nr) = count; - } - - uffd_test_ops->release_pages(area_dst); - - for (nr = 0; nr < nr_pages; nr++) - if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) - err("nr %lu is not zero", nr); - - return 0; -} - -static void retry_uffdio_zeropage(int ufd, - struct uffdio_zeropage *uffdio_zeropage, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, - uffdio_zeropage->range.len, - offset); - if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { - if (uffdio_zeropage->zeropage != -EEXIST) - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } else { - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } -} - -static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) -{ - struct uffdio_zeropage uffdio_zeropage; - int ret; - bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); - __s64 res; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu", offset); - uffdio_zeropage.range.start = (unsigned long) area_dst + offset; - uffdio_zeropage.range.len = page_size; - uffdio_zeropage.mode = 0; - ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); - res = uffdio_zeropage.zeropage; - if (ret) { - /* real retval in ufdio_zeropage.zeropage */ - if (has_zeropage) - err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); - else if (res != -EINVAL) - err("UFFDIO_ZEROPAGE not -EINVAL"); - } else if (has_zeropage) { - if (res != page_size) { - err("UFFDIO_ZEROPAGE unexpected size"); - } else { - if (test_uffdio_zeropage_eexist && retry) { - test_uffdio_zeropage_eexist = false; - retry_uffdio_zeropage(ufd, &uffdio_zeropage, - offset); - } - return 1; - } - } else - err("UFFDIO_ZEROPAGE succeeded"); - - return 0; -} - -static int uffdio_zeropage(int ufd, unsigned long offset) -{ - return __uffdio_zeropage(ufd, offset, false); -} - -/* exercise UFFDIO_ZEROPAGE */ -static int userfaultfd_zeropage_test(void) -{ - struct uffdio_register uffdio_register; - - printf("testing UFFDIO_ZEROPAGE: "); - fflush(stdout); - - uffd_test_ctx_init(0); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (uffdio_zeropage(uffd, 0)) - if (my_bcmp(area_dst, zeropage, page_size)) - err("zeropage is not zero"); - - printf("done.\n"); - return 0; -} - -static int userfaultfd_events_test(void) -{ - struct uffdio_register uffdio_register; - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct uffd_stats stats = { 0 }; - - printf("testing events (fork, remap, remove): "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | - UFFD_FEATURE_EVENT_REMOVE; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (pthread_create(&uffd_mon, 
&attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(0)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&stats, 1); - - return stats.missing_faults != nr_pages; -} - -static int userfaultfd_sig_test(void) -{ - struct uffdio_register uffdio_register; - unsigned long userfaults; - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct uffd_stats stats = { 0 }; - - printf("testing signal delivery: "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (faulting_process(1)) - err("faulting process failed"); - - uffd_test_ops->release_pages(area_dst); - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(2)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, (void **)&userfaults)) - return 1; - - printf("done.\n"); - if (userfaults) - err("Signal test failed, userfaults: %ld", userfaults); - - return userfaults != 0; -} - -void check_memory_contents(char *p) -{ - unsigned long i; - uint8_t expected_byte; - void *expected_page; - - if (posix_memalign(&expected_page, page_size, page_size)) - err("out of memory"); - - for (i = 0; i < nr_pages; ++i) { - expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); - memset(expected_page, expected_byte, page_size); - if (my_bcmp(expected_page, p + (i * page_size), page_size)) - err("unexpected page contents after minor fault"); - } - - free(expected_page); -} - -static int userfaultfd_minor_test(void) -{ - unsigned long p; - struct uffdio_register uffdio_register; - pthread_t uffd_mon; - char c; - struct uffd_stats stats = { 0 }; - - if (!test_uffdio_minor) - return 0; - - printf("testing minor faults: "); - fflush(stdout); - - uffd_test_ctx_init(uffd_minor_feature()); - - uffdio_register.range.start = (unsigned long)area_dst_alias; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - /* - * After registering with UFFD, populate the non-UFFD-registered side of - * the shared mapping. This should *not* trigger any UFFD minor faults. - */ - for (p = 0; p < nr_pages; ++p) { - memset(area_dst + (p * page_size), p % ((uint8_t)-1), - page_size); - } - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - /* - * Read each of the pages back using the UFFD-registered mapping. 
We - * expect that the first time we touch a page, it will result in a minor - * fault. uffd_poll_thread will resolve the fault by bit-flipping the - * page's contents, and then issuing a CONTINUE ioctl. - */ - check_memory_contents(area_dst_alias); - - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&stats, 1); - - if (test_collapse) { - printf("testing collapse of uffd memory into PMD-mapped THPs:"); - if (madvise(area_dst_alias, nr_pages * page_size, - MADV_COLLAPSE)) - err("madvise(MADV_COLLAPSE)"); - - uffd_test_ops->check_pmd_mapping(area_dst, - nr_pages * page_size / - hpage_size); - /* - * This won't cause uffd-fault - it purely just makes sure there - * was no corruption. - */ - check_memory_contents(area_dst_alias); - printf(" done.\n"); - } - - return stats.missing_faults != 0 || stats.minor_faults != nr_pages; -} - -#define BIT_ULL(nr) (1ULL << (nr)) -#define PM_SOFT_DIRTY BIT_ULL(55) -#define PM_MMAP_EXCLUSIVE BIT_ULL(56) -#define PM_UFFD_WP BIT_ULL(57) -#define PM_FILE BIT_ULL(61) -#define PM_SWAP BIT_ULL(62) -#define PM_PRESENT BIT_ULL(63) - -static int pagemap_open(void) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - - if (fd < 0) - err("open pagemap"); - - return fd; -} - -static uint64_t pagemap_read_vaddr(int fd, void *vaddr) -{ - uint64_t value; - int ret; - - ret = pread(fd, &value, sizeof(uint64_t), - ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); - if (ret != sizeof(uint64_t)) - err("pread() on pagemap failed"); - - return value; -} - -/* This macro let __LINE__ works in err() */ -#define pagemap_check_wp(value, wp) do { \ - if (!!(value & PM_UFFD_WP) != wp) \ - err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ - } while (0) - -static int pagemap_test_fork(bool present) -{ - pid_t child = fork(); - uint64_t value; - int fd, result; - - if (!child) { - /* Open the pagemap fd of the child itself */ - fd = pagemap_open(); - value = pagemap_read_vaddr(fd, area_dst); - /* - * After fork() uffd-wp bit should be gone as long as we're - * without UFFD_FEATURE_EVENT_FORK - */ - pagemap_check_wp(value, false); - /* Succeed */ - exit(0); - } - waitpid(child, &result, 0); - return result; -} - -static void userfaultfd_pagemap_test(unsigned int test_pgsize) -{ - struct uffdio_register uffdio_register; - int pagemap_fd; - uint64_t value; - - /* Pagemap tests uffd-wp only */ - if (!test_uffdio_wp) - return; - - /* Not enough memory to test this page size */ - if (test_pgsize > nr_pages * page_size) - return; - - printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); - /* Flush so it doesn't flush twice in parent/child later */ - fflush(stdout); - - uffd_test_ctx_init(0); - - if (test_pgsize > page_size) { - /* This is a thp test */ - if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) - err("madvise(MADV_HUGEPAGE) failed"); - } else if (test_pgsize == page_size) { - /* This is normal page test; force no thp */ - if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) - err("madvise(MADV_NOHUGEPAGE) failed"); - } - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failed"); - - pagemap_fd = pagemap_open(); - - /* Touch the page */ - *area_dst = 1; - wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, 
true); - /* Make sure the uffd-wp bit is dropped on fork */ - if (pagemap_test_fork(true)) - err("Detected stale uffd-wp bit in child"); - - /* Exclusive required or PAGEOUT won't work */ - if (!(value & PM_MMAP_EXCLUSIVE)) - err("multiple mapping detected: 0x%"PRIx64, value); - - if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) - err("madvise(MADV_PAGEOUT) failed"); - - /* Uffd-wp should persist even swapped out */ - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - /* Make sure the uffd-wp bit is dropped on fork */ - if (pagemap_test_fork(false)) - err("Detected stale uffd-wp bit in child"); - - /* Unprotect; this tests swap pte modifications */ - wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - /* Fault in the page from disk */ - *area_dst = 2; - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - close(pagemap_fd); - printf("done\n"); -} - -static int userfaultfd_stress(void) -{ - void *area; - unsigned long nr; - struct uffdio_register uffdio_register; - struct uffd_stats uffd_stats[nr_cpus]; - - uffd_test_ctx_init(0); - - if (posix_memalign(&area, page_size, page_size)) - err("out of memory"); - zeropage = area; - bzero(zeropage, page_size); - - pthread_mutex_lock(&uffd_read_mutex); - - pthread_attr_init(&attr); - pthread_attr_setstacksize(&attr, 16*1024*1024); - - while (bounces--) { - printf("bounces: %d, mode:", bounces); - if (bounces & BOUNCE_RANDOM) - printf(" rnd"); - if (bounces & BOUNCE_RACINGFAULTS) - printf(" racing"); - if (bounces & BOUNCE_VERIFY) - printf(" ver"); - if (bounces & BOUNCE_POLL) - printf(" poll"); - else - printf(" read"); - printf(", "); - fflush(stdout); - - if (bounces & BOUNCE_POLL) - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - else - fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); - - /* register */ - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) - area_dst_alias; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure alias"); - } - - /* - * The madvise done previously isn't enough: some - * uffd_thread could have read userfaults (one of - * those already resolved by the background thread) - * and it may be in the process of calling - * UFFDIO_COPY. UFFDIO_COPY will read the zapped - * area_src and it would map a zero page in it (of - * course such a UFFDIO_COPY is perfectly safe as it'd - * return -EEXIST). The problem comes at the next - * bounce though: that racing UFFDIO_COPY would - * generate zeropages in the area_src, so invalidating - * the previous MADV_DONTNEED. Without this additional - * MADV_DONTNEED those zeropages leftovers in the - * area_src would lead to -EEXIST failure during the - * next bounce, effectively leaving a zeropage in the - * area_dst. - * - * Try commenting out this madvise to see the memory - * corruption get caught pretty quickly. - * - * Also, khugepaged is only inhibited from re-collapsing THPs - * after MADV_DONTNEED once UFFDIO_REGISTER has been done, so - * the MADV_DONTNEED must happen here.
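- * (Sketch of the race: a reader thread still holds a stale fault, and - * its in-flight UFFDIO_COPY reads the zapped area_src, faulting zero - * pages back into it; after swap() those leftovers sit in the new - * area_dst already mapped, so the next bounce's UFFDIO_COPY hits - * -EEXIST and a zeropage is silently left in place.)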
- */ - uffd_test_ops->release_pages(area_dst); - - uffd_stats_reset(uffd_stats, nr_cpus); - - /* bounce pass */ - if (stress(uffd_stats)) - return 1; - - /* Clear all the write protections if there is any */ - if (test_uffdio_wp) - wp_range(uffd, (unsigned long)area_dst, - nr_pages * page_size, false); - - /* unregister */ - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) - err("unregister failure"); - if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) area_dst; - if (ioctl(uffd, UFFDIO_UNREGISTER, - &uffdio_register.range)) - err("unregister failure alias"); - } - - /* verification */ - if (bounces & BOUNCE_VERIFY) - for (nr = 0; nr < nr_pages; nr++) - if (*area_count(area_dst, nr) != count_verify[nr]) - err("error area_count %llu %llu %lu\n", - *area_count(area_src, nr), - count_verify[nr], nr); - - /* prepare next bounce */ - swap(area_src, area_dst); - - swap(area_src_alias, area_dst_alias); - - uffd_stats_report(uffd_stats, nr_cpus); - } - - if (test_type == TEST_ANON) { - /* - * shmem/hugetlb won't be able to run since they have different - * behavior on fork() (file-backed memory normally drops ptes - * directly when fork), meanwhile the pagemap test will verify - * pgtable entry of fork()ed child. - */ - userfaultfd_pagemap_test(page_size); - /* - * Hard-code for x86_64 for now for 2M THP, as x86_64 is - * currently the only one that supports uffd-wp - */ - userfaultfd_pagemap_test(page_size * 512); - } - - return userfaultfd_zeropage_test() || userfaultfd_sig_test() - || userfaultfd_events_test() || userfaultfd_minor_test(); -} - -/* - * Copied from mlock2-tests.c - */ -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - -static void set_test_type(const char *type) -{ - if (!strcmp(type, "anon")) { - test_type = TEST_ANON; - uffd_test_ops = &anon_uffd_test_ops; - } else if (!strcmp(type, "hugetlb")) { - test_type = TEST_HUGETLB; - uffd_test_ops = &hugetlb_uffd_test_ops; - } else if (!strcmp(type, "hugetlb_shared")) { - map_shared = true; - test_type = TEST_HUGETLB; - uffd_test_ops = &hugetlb_uffd_test_ops; - /* Minor faults require shared hugetlb; only enable here. 
*/ - test_uffdio_minor = true; - } else if (!strcmp(type, "shmem")) { - map_shared = true; - test_type = TEST_SHMEM; - uffd_test_ops = &shmem_uffd_test_ops; - test_uffdio_minor = true; - } -} - -static void parse_test_type_arg(const char *raw_type) -{ - char *buf = strdup(raw_type); - uint64_t features = UFFD_API_FEATURES; - - while (buf) { - const char *token = strsep(&buf, ":"); - - if (!test_type) - set_test_type(token); - else if (!strcmp(token, "dev")) - test_dev_userfaultfd = true; - else if (!strcmp(token, "syscall")) - test_dev_userfaultfd = false; - else if (!strcmp(token, "collapse")) - test_collapse = true; - else - err("unrecognized test mod '%s'", token); - } - - if (!test_type) - err("failed to parse test type argument: '%s'", raw_type); - - if (test_collapse && test_type != TEST_SHMEM) - err("Unsupported test: %s", raw_type); - - if (test_type == TEST_HUGETLB) - page_size = hpage_size; - else - page_size = sysconf(_SC_PAGE_SIZE); - - if (!page_size) - err("Unable to determine page size"); - if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 - > page_size) - err("Impossible to run this test"); - - /* - * Whether we can test certain features depends not just on test type, - * but also on whether or not this particular kernel supports the - * feature. - */ - - userfaultfd_open(&features); - - test_uffdio_wp = test_uffdio_wp && - (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); - test_uffdio_minor = test_uffdio_minor && - (features & uffd_minor_feature()); - - close(uffd); - uffd = -1; -} - -static void sigalrm(int sig) -{ - if (sig != SIGALRM) - abort(); - test_uffdio_copy_eexist = true; - test_uffdio_zeropage_eexist = true; - alarm(ALARM_INTERVAL_SECS); -} - -int main(int argc, char **argv) -{ - size_t bytes; - - if (argc < 4) - usage(); - - if (signal(SIGALRM, sigalrm) == SIG_ERR) - err("failed to arm SIGALRM"); - alarm(ALARM_INTERVAL_SECS); - - hpage_size = default_huge_page_size(); - parse_test_type_arg(argv[1]); - bytes = atol(argv[2]) * 1024 * 1024; - - if (test_collapse && bytes & (hpage_size - 1)) - err("MiB must be multiple of %lu if :collapse mod set", - hpage_size >> 20); - - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - - if (test_collapse) { - /* nr_cpus must divide (bytes / page_size), otherwise, - * area allocations of (nr_pages * page_size) won't be a - * multiple of hpage_size, even if bytes is a multiple of - * hpage_size. - * - * This means that nr_cpus must divide (N * (2 << (H-P))) - * where: - * bytes = hpage_size * N - * hpage_size = 2 << H - * page_size = 2 << P - * - * And we want to choose nr_cpus to be the largest value - * satisfying this constraint, not larger than the number - * of online CPUs. Unfortunately, prime factorization of - * N and nr_cpus may be arbitrary, so we would have to search - * for it. Instead, just use the highest power of 2 dividing - * both nr_cpus and (bytes / page_size). - * (For example, nr_cpus = 12 gives factor_of_2(12) = 4.) - */ - int x = factor_of_2(nr_cpus); - int y = factor_of_2(bytes / page_size); - - nr_cpus = x < y ?
x : y; - } - nr_pages_per_cpu = bytes / page_size / nr_cpus; - if (!nr_pages_per_cpu) { - _err("invalid MiB"); - usage(); - } - - bounces = atoi(argv[3]); - if (bounces <= 0) { - _err("invalid bounces"); - usage(); - } - nr_pages = nr_pages_per_cpu * nr_cpus; - - if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) { - unsigned int memfd_flags = 0; - - if (test_type == TEST_HUGETLB) - memfd_flags = MFD_HUGETLB; - mem_fd = memfd_create(argv[0], memfd_flags); - if (mem_fd < 0) - err("memfd_create"); - if (ftruncate(mem_fd, nr_pages * page_size * 2)) - err("ftruncate"); - if (fallocate(mem_fd, - FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, - nr_pages * page_size * 2)) - err("fallocate"); - } - printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", - nr_pages, nr_pages_per_cpu); - return userfaultfd_stress(); -} - -#else /* __NR_userfaultfd */ - -#warning "missing __NR_userfaultfd definition" - -int main(void) -{ - printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); - return KSFT_SKIP; -} - -#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/vm/util.h b/tools/testing/selftests/vm/util.h deleted file mode 100644 index b27d26199334..000000000000 --- a/tools/testing/selftests/vm/util.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef __KSELFTEST_VM_UTIL_H -#define __KSELFTEST_VM_UTIL_H - -#include <stdint.h> -#include <sys/mman.h> -#include <err.h> -#include <strings.h> /* ffsl() */ -#include <unistd.h> /* _SC_PAGESIZE */ - -static unsigned int __page_size; -static unsigned int __page_shift; - -static inline unsigned int page_size(void) -{ - if (!__page_size) - __page_size = sysconf(_SC_PAGESIZE); - return __page_size; -} - -static inline unsigned int page_shift(void) -{ - if (!__page_shift) - __page_shift = (ffsl(page_size()) - 1); - return __page_shift; -} - -#define PAGE_SHIFT (page_shift()) -#define PAGE_SIZE (page_size()) -/* - * On ppc64 this will only work with radix 2M hugepage size - */ -#define HPAGE_SHIFT 21 -#define HPAGE_SIZE (1 << HPAGE_SHIFT) - -#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) -#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) - - -static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd) -{ - uint64_t ent[2]; - - /* drop pmd */ - if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_ANONYMOUS | - MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) - errx(2, "mmap transhuge"); - - if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); - - /* allocate transparent huge page */ - *(volatile void **)ptr = ptr; - - if (pread(pagemap_fd, ent, sizeof(ent), - (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) - err(2, "read pagemap"); - - if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && - PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && - !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) - return PAGEMAP_PFN(ent[0]); - - return -1; -} - -#endif diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c deleted file mode 100644 index 1d2068989883..000000000000 --- a/tools/testing/selftests/vm/va_128TBswitch.c +++ /dev/null @@ -1,289 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * - * Authors: Kirill A.
Shutemov <kirill.shutemov@linux.intel.com> - * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - */ - -#include <stdio.h> -#include <sys/mman.h> -#include <string.h> - -#include "../kselftest.h" - -#ifdef __powerpc64__ -#define PAGE_SIZE (64 << 10) -/* - * This will work with 16M and 2M hugepage size - */ -#define HUGETLB_SIZE (16 << 20) -#else -#define PAGE_SIZE (4 << 10) -#define HUGETLB_SIZE (2 << 20) -#endif - -/* - * >= 128TB is the hint addr value we use to select - * the large address space. - */ -#define ADDR_SWITCH_HINT (1UL << 47) -#define LOW_ADDR ((void *) (1UL << 30)) -#define HIGH_ADDR ((void *) (1UL << 48)) - -struct testcase { - void *addr; - unsigned long size; - unsigned long flags; - const char *msg; - unsigned int low_addr_required:1; - unsigned int keep_mapped:1; -}; - -static struct testcase testcases[] = { - { - /* - * If stack is moved, we could possibly allocate - * this at the requested address. - */ - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", - .low_addr_required = 1, - }, - { - /* - * We should never allocate at the requested address or above it. - * The len crosses the 128TB boundary. Without MAP_FIXED - * we will always search in the lower address space. - */ - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", - .low_addr_required = 1, - }, - { - /* - * Exact mapping at 128TB; the area is free, so we should get it - * even without MAP_FIXED. - */ - .addr = ((void *)(ADDR_SWITCH_HINT)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", - }, - { - .addr = NULL, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED)", - }, - { - .addr = (void *) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1)", - .keep_mapped = 1, - }, - { - .addr = (void *) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1) again", - }, - { - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", - .low_addr_required = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", - .low_addr_required = 1, -
.keep_mapped = 1, - }, - { - .addr = ((void *)(ADDR_SWITCH_HINT)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", - }, -}; - -static struct testcase hugetlb_testcases[] = { - { - .addr = NULL, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", - }, - { - .addr = (void *) -1, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = (void *) -1, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB) again", - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), - .size = 2 * HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", - }, -}; - -static int run_test(struct testcase *test, int count) -{ - void *p; - int i, ret = KSFT_PASS; - - for (i = 0; i < count; i++) { - struct testcase *t = test + i; - - p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); - - printf("%s: %p - ", t->msg, p); - - if (p == MAP_FAILED) { - printf("FAILED\n"); - ret = KSFT_FAIL; - continue; - } - - if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { - printf("FAILED\n"); - ret = KSFT_FAIL; - } else { - /* - * Do a dereference of the address returned so that we catch - * bugs in page fault handling - */ - memset(p, 0, t->size); - printf("OK\n"); - } - if (!t->keep_mapped) - munmap(p, t->size); - } - - return ret; -} - -static int supported_arch(void) -{ -#if defined(__powerpc64__) - return 1; -#elif defined(__x86_64__) - return 1; -#else - return 0; -#endif -} - -int main(int argc, char **argv) -{ - int ret; - - if (!supported_arch()) - return KSFT_SKIP; - - ret = run_test(testcases, ARRAY_SIZE(testcases)); - if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) - ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); - return ret; -} diff --git a/tools/testing/selftests/vm/va_128TBswitch.sh b/tools/testing/selftests/vm/va_128TBswitch.sh deleted file mode 100755 index 41580751dc51..000000000000 --- a/tools/testing/selftests/vm/va_128TBswitch.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Copyright (C) 2022 Adam Sindelar (Meta) -# -# This 
is a test for mmap behavior with 5-level paging. This script wraps the -# real test to check that the kernel is configured to support at least 5 -# pagetable levels. - -# 1 means the test failed -exitcode=1 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -fail() -{ - echo "$1" - exit $exitcode -} - -check_supported_x86_64() -{ - local config="/proc/config.gz" - [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" - [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" - - # gzip -dcfq automatically handles both compressed and plaintext input. - # See man 1 gzip under '-f'. - local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) - - if [[ "${pg_table_levels}" -lt 5 ]]; then - echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" - exit $ksft_skip - fi -} - -check_test_requirements() -{ - # The test supports x86_64 and powerpc64. We currently have no useful - # eligibility check for powerpc64, and the test itself will reject other - # architectures. - case `uname -m` in - "x86_64") - check_supported_x86_64 - ;; - *) - return 0 - ;; - esac -} - -check_test_requirements -./va_128TBswitch diff --git a/tools/testing/selftests/vm/virtual_address_range.c b/tools/testing/selftests/vm/virtual_address_range.c deleted file mode 100644 index c0592646ed93..000000000000 --- a/tools/testing/selftests/vm/virtual_address_range.c +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2017, Anshuman Khandual, IBM Corp. - * - * Works on architectures which support 128TB virtual - * address range and beyond. - */ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <errno.h> -#include <sys/mman.h> -#include <sys/time.h> - -/* - * Maximum address range mapped with a single mmap() - * call is a little bit more than 16GB. Hence 16GB is - * chosen as the single chunk size for address space - * mapping. - */ -#define MAP_CHUNK_SIZE 17179869184UL /* 16GB */ - -/* - * Address space till 128TB is mapped without any hint - * and is enabled by default. Address space beyond 128TB - * till 512TB is obtained by passing hint address as the - * first argument into mmap() system call. - * - * The process heap address space is divided into two - * different areas, one below 128TB and one above 128TB, - * reaching up to 512TB. One has size 128TB and the - * other 384TB. - * - * On Arm64 the address space is 256TB and no high mappings - * are supported so far.
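- * With 16GB chunks that works out to NR_CHUNKS_128TB = 8192 low - * chunks (128TB) plus NR_CHUNKS_384TB = 24576 high chunks (384TB) on - * x86_64/ppc64, and 16384 low chunks (256TB) with no high chunks on - * arm64, matching the defines below.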
- */ - -#define NR_CHUNKS_128TB 8192UL /* Number of 16GB chunks for 128TB */ -#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) -#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) - -#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ -#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ - -#ifdef __aarch64__ -#define HIGH_ADDR_MARK ADDR_MARK_256TB -#define HIGH_ADDR_SHIFT 49 -#define NR_CHUNKS_LOW NR_CHUNKS_256TB -#define NR_CHUNKS_HIGH 0 -#else -#define HIGH_ADDR_MARK ADDR_MARK_128TB -#define HIGH_ADDR_SHIFT 48 -#define NR_CHUNKS_LOW NR_CHUNKS_128TB -#define NR_CHUNKS_HIGH NR_CHUNKS_384TB -#endif - -static char *hint_addr(void) -{ - int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); - - return (char *) (1UL << bits); -} - -static int validate_addr(char *ptr, int high_addr) -{ - unsigned long addr = (unsigned long) ptr; - - if (high_addr) { - if (addr < HIGH_ADDR_MARK) { - printf("Bad address %lx\n", addr); - return 1; - } - return 0; - } - - if (addr > HIGH_ADDR_MARK) { - printf("Bad address %lx\n", addr); - return 1; - } - return 0; -} - -static int validate_lower_address_hint(void) -{ - char *ptr; - - ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | - PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr == MAP_FAILED) - return 0; - - return 1; -} - -int main(int argc, char *argv[]) -{ - char *ptr[NR_CHUNKS_LOW]; - char *hptr[NR_CHUNKS_HIGH]; - char *hint; - unsigned long i, lchunks, hchunks; - - for (i = 0; i < NR_CHUNKS_LOW; i++) { - ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) - return 1; - break; - } - - if (validate_addr(ptr[i], 0)) - return 1; - } - lchunks = i; - - for (i = 0; i < NR_CHUNKS_HIGH; i++) { - hint = hint_addr(); - hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (hptr[i] == MAP_FAILED) - break; - - if (validate_addr(hptr[i], 1)) - return 1; - } - hchunks = i; - - for (i = 0; i < lchunks; i++) - munmap(ptr[i], MAP_CHUNK_SIZE); - - for (i = 0; i < hchunks; i++) - munmap(hptr[i], MAP_CHUNK_SIZE); - - return 0; -} diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c deleted file mode 100644 index 40e795624ff3..000000000000 --- a/tools/testing/selftests/vm/vm_util.c +++ /dev/null @@ -1,151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <string.h> -#include <fcntl.h> -#include "../kselftest.h" -#include "vm_util.h" - -#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" -#define SMAP_FILE_PATH "/proc/self/smaps" -#define MAX_LINE_LENGTH 500 - -uint64_t pagemap_get_entry(int fd, char *start) -{ - const unsigned long pfn = (unsigned long)start / getpagesize(); - uint64_t entry; - int ret; - - ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry)); - if (ret != sizeof(entry)) - ksft_exit_fail_msg("reading pagemap failed\n"); - return entry; -} - -bool pagemap_is_softdirty(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - // Check if the soft-dirty bit (bit 55) is set - return entry & 0x0080000000000000ull; -} - -bool pagemap_is_swapped(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - return entry & 0x4000000000000000ull; -} - -bool pagemap_is_populated(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - /* Present or swapped.
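(Bit 63 = present, bit 62 = swapped; hence the 0xc000000000000000ull mask below.)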
*/ - return entry & 0xc000000000000000ull; -} - -unsigned long pagemap_get_pfn(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - /* If present (63th bit), PFN is at bit 0 -- 54. */ - if (entry & 0x8000000000000000ull) - return entry & 0x007fffffffffffffull; - return -1ul; -} - -void clear_softdirty(void) -{ - int ret; - const char *ctrl = "4"; - int fd = open("/proc/self/clear_refs", O_WRONLY); - - if (fd < 0) - ksft_exit_fail_msg("opening clear_refs failed\n"); - ret = write(fd, ctrl, strlen(ctrl)); - close(fd); - if (ret != strlen(ctrl)) - ksft_exit_fail_msg("writing clear_refs failed\n"); -} - -bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) -{ - while (fgets(buf, len, fp)) { - if (!strncmp(buf, pattern, strlen(pattern))) - return true; - } - return false; -} - -uint64_t read_pmd_pagesize(void) -{ - int fd; - char buf[20]; - ssize_t num_read; - - fd = open(PMD_SIZE_FILE_PATH, O_RDONLY); - if (fd == -1) - ksft_exit_fail_msg("Open hpage_pmd_size failed\n"); - - num_read = read(fd, buf, 19); - if (num_read < 1) { - close(fd); - ksft_exit_fail_msg("Read hpage_pmd_size failed\n"); - } - buf[num_read] = '\0'; - close(fd); - - return strtoul(buf, NULL, 10); -} - -bool __check_huge(void *addr, char *pattern, int nr_hpages, - uint64_t hpage_size) -{ - uint64_t thp = -1; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - char addr_pattern[MAX_LINE_LENGTH]; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", - (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) - ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); - - fp = fopen(SMAP_FILE_PATH, "r"); - if (!fp) - ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); - - if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) - goto err_out; - - /* - * Fetch the pattern in the same block and check the number of - * hugepages. 
- */ - if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) - goto err_out; - - snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); - - if (sscanf(buffer, addr_pattern, &thp) != 1) - ksft_exit_fail_msg("Reading smap error\n"); - -err_out: - fclose(fp); - return thp == (nr_hpages * (hpage_size >> 10)); -} - -bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) -{ - return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); -} - -bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) -{ - return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); -} - -bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) -{ - return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); -} diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h deleted file mode 100644 index 1995ee911ef2..000000000000 --- a/tools/testing/selftests/vm/vm_util.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include - -uint64_t pagemap_get_entry(int fd, char *start); -bool pagemap_is_softdirty(int fd, char *start); -bool pagemap_is_swapped(int fd, char *start); -bool pagemap_is_populated(int fd, char *start); -unsigned long pagemap_get_pfn(int fd, char *start); -void clear_softdirty(void); -bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); -uint64_t read_pmd_pagesize(void); -bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); -bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); -bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/vm/write_hugetlb_memory.sh deleted file mode 100644 index 70a02301f4c2..000000000000 --- a/tools/testing/selftests/vm/write_hugetlb_memory.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -set -e - -size=$1 -populate=$2 -write=$3 -cgroup=$4 -path=$5 -method=$6 -private=$7 -want_sleep=$8 -reserve=$9 - -echo "Putting task in cgroup '$cgroup'" -echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs - -echo "Method is $method" - -set +e -./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \ - "$private" "$want_sleep" "$reserve" diff --git a/tools/testing/selftests/vm/write_to_hugetlbfs.c b/tools/testing/selftests/vm/write_to_hugetlbfs.c deleted file mode 100644 index 6a2caba19ee1..000000000000 --- a/tools/testing/selftests/vm/write_to_hugetlbfs.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This program reserves and uses hugetlb memory, supporting a bunch of - * scenarios needed by the charged_reserved_hugetlb.sh test. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Global definitions. */ -enum method { - HUGETLBFS, - MMAP_MAP_HUGETLB, - SHM, - MAX_METHOD -}; - - -/* Global variables. */ -static const char *self; -static char *shmaddr; -static int shmid; - -/* - * Show usage and exit. 
- */ -static void exit_usage(void) -{ - printf("Usage: %s -p -s " - "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] " - "[-o] [-w] [-n]\n", - self); - exit(EXIT_FAILURE); -} - -void sig_handler(int signo) -{ - printf("Received %d.\n", signo); - if (signo == SIGINT) { - printf("Deleting the memory\n"); - if (shmdt((const void *)shmaddr) != 0) { - perror("Detach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(4); - } - - shmctl(shmid, IPC_RMID, NULL); - printf("Done deleting the memory\n"); - } - exit(2); -} - -int main(int argc, char **argv) -{ - int fd = 0; - int key = 0; - int *ptr = NULL; - int c = 0; - int size = 0; - char path[256] = ""; - enum method method = MAX_METHOD; - int want_sleep = 0, private = 0; - int populate = 0; - int write = 0; - int reserve = 1; - - if (signal(SIGINT, sig_handler) == SIG_ERR) - err(1, "\ncan't catch SIGINT\n"); - - /* Parse command-line arguments. */ - setvbuf(stdout, NULL, _IONBF, 0); - self = argv[0]; - - while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) { - switch (c) { - case 's': - size = atoi(optarg); - break; - case 'p': - strncpy(path, optarg, sizeof(path)); - break; - case 'm': - if (atoi(optarg) >= MAX_METHOD) { - errno = EINVAL; - perror("Invalid -m."); - exit_usage(); - } - method = atoi(optarg); - break; - case 'o': - populate = 1; - break; - case 'w': - write = 1; - break; - case 'l': - want_sleep = 1; - break; - case 'r': - private - = 1; - break; - case 'n': - reserve = 0; - break; - default: - errno = EINVAL; - perror("Invalid arg"); - exit_usage(); - } - } - - if (strncmp(path, "", sizeof(path)) != 0) { - printf("Writing to this path: %s\n", path); - } else { - errno = EINVAL; - perror("path not found"); - exit_usage(); - } - - if (size != 0) { - printf("Writing this size: %d\n", size); - } else { - errno = EINVAL; - perror("size not found"); - exit_usage(); - } - - if (!populate) - printf("Not populating.\n"); - else - printf("Populating.\n"); - - if (!write) - printf("Not writing to memory.\n"); - - if (method == MAX_METHOD) { - errno = EINVAL; - perror("-m Invalid"); - exit_usage(); - } else - printf("Using method=%d\n", method); - - if (!private) - printf("Shared mapping.\n"); - else - printf("Private mapping.\n"); - - if (!reserve) - printf("NO_RESERVE mapping.\n"); - else - printf("RESERVE mapping.\n"); - - switch (method) { - case HUGETLBFS: - printf("Allocating using HUGETLBFS.\n"); - fd = open(path, O_CREAT | O_RDWR, 0777); - if (fd == -1) - err(1, "Failed to open file."); - - ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, - (private ? MAP_PRIVATE : MAP_SHARED) | - (populate ? MAP_POPULATE : 0) | - (reserve ? 0 : MAP_NORESERVE), - fd, 0); - - if (ptr == MAP_FAILED) { - close(fd); - err(1, "Error mapping the file"); - } - break; - case MMAP_MAP_HUGETLB: - printf("Allocating using MAP_HUGETLB.\n"); - ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, - (private ? (MAP_PRIVATE | MAP_ANONYMOUS) : - MAP_SHARED) | - MAP_HUGETLB | (populate ? MAP_POPULATE : 0) | - (reserve ? 
0 : MAP_NORESERVE), - -1, 0); - - if (ptr == MAP_FAILED) - err(1, "mmap"); - - printf("Returned address is %p\n", ptr); - break; - case SHM: - printf("Allocating using SHM.\n"); - shmid = shmget(key, size, - SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); - if (shmid < 0) { - shmid = shmget(++key, size, - SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); - if (shmid < 0) - err(1, "shmget"); - } - printf("shmid: 0x%x, shmget key:%d\n", shmid, key); - - ptr = shmat(shmid, NULL, 0); - if (ptr == (int *)-1) { - perror("Shared memory attach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(2); - } - printf("shmaddr: %p\n", ptr); - - break; - default: - errno = EINVAL; - err(1, "Invalid method."); - } - - if (write) { - printf("Writing to memory.\n"); - memset(ptr, 1, size); - } - - if (want_sleep) { - /* Signal to caller that we're done. */ - printf("DONE\n"); - - /* Hold memory until external kill signal is delivered. */ - while (1) - sleep(100); - } - - if (method == HUGETLBFS) - close(fd); - - return 0; -} -- cgit v1.2.3 From da0618c146ca0e1412173a8a229dd737a73b1a4f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 2 Jan 2023 16:11:26 +0000 Subject: selftest/vm: add mremap expand merge offset test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a test to assert that we can mremap() and expand a mapping starting from an offset within an existing mapping. We unmap the last page in a 3 page mapping to ensure that the remap should always succeed, before remapping from the 2nd page. This is additionally a regression test for the issue solved in "mm, mremap: fix mremap() expanding vma with addr inside vma" and confirmed to fail prior to the change and pass after it. Finally, this patch updates the existing mremap expand merge test to check error conditions and reduce code duplication between the two tests. [lstoakes@gmail.com: increment num_expand_tests so test doesn't complain about unexpected tests being run] Link: https://lkml.kernel.org/r/8ff3ba3cadc0b6c1b2688ae5c851bf73aa062d57.1673701836.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/02b117a8ffd52acc01dc66c2fb39754f08d92c0e.1672675824.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Jakub Matěna Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 119 +++++++++++++++++++++++++------ 1 file changed, 96 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 9496346973d4..5c3773de9f0f 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -119,47 +119,109 @@ static unsigned long long get_mmap_min_addr(void) } /* - * This test validates that merge is called when expanding a mapping. - * Mapping containing three pages is created, middle page is unmapped - * and then the mapping containing the first page is expanded so that - * it fills the created hole. The two parts should merge creating - * single mapping with three pages. + * Using /proc/self/maps, assert that the specified address range is contained + * within a single mapping. 
*/ -static void mremap_expand_merge(unsigned long page_size) +static bool is_range_mapped(FILE *maps_fp, void *start, void *end) { - char *test_name = "mremap expand merge"; - FILE *fp; char *line = NULL; size_t len = 0; bool success = false; - char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - munmap(start + page_size, page_size); - mremap(start, page_size, 2 * page_size, 0); - fp = fopen("/proc/self/maps", "r"); - if (fp == NULL) { - ksft_test_result_fail("%s\n", test_name); - return; - } + rewind(maps_fp); - while (getline(&line, &len, fp) != -1) { + while (getline(&line, &len, maps_fp) != -1) { char *first = strtok(line, "- "); void *first_val = (void *)strtol(first, NULL, 16); char *second = strtok(NULL, "- "); void *second_val = (void *) strtol(second, NULL, 16); - if (first_val == start && second_val == start + 3 * page_size) { + if (first_val <= start && second_val >= end) { success = true; break; } } + + return success; +} + +/* + * This test validates that merge is called when expanding a mapping. + * Mapping containing three pages is created, middle page is unmapped + * and then the mapping containing the first page is expanded so that + * it fills the created hole. The two parts should merge creating + * single mapping with three pages. + */ +static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size) +{ + char *test_name = "mremap expand merge"; + bool success = false; + char *remap, *start; + + start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (start == MAP_FAILED) { + ksft_print_msg("mmap failed: %s\n", strerror(errno)); + goto out; + } + + munmap(start + page_size, page_size); + remap = mremap(start, page_size, 2 * page_size, 0); + if (remap == MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + munmap(start, page_size); + munmap(start + 2 * page_size, page_size); + goto out; + } + + success = is_range_mapped(maps_fp, start, start + 3 * page_size); + munmap(start, 3 * page_size); + +out: + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} + +/* + * Similar to mremap_expand_merge() except instead of removing the middle page, + * we remove the last then attempt to remap offset from the second page. This + * should result in the mapping being restored to its former state. + */ +static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size) +{ + + char *test_name = "mremap expand merge offset"; + bool success = false; + char *remap, *start; + + start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (start == MAP_FAILED) { + ksft_print_msg("mmap failed: %s\n", strerror(errno)); + goto out; + } + + /* Unmap final page to ensure we have space to expand. 
*/
+	munmap(start + 2 * page_size, page_size);
+	remap = mremap(start + page_size, page_size, 2 * page_size, 0);
+	if (remap == MAP_FAILED) {
+		ksft_print_msg("mremap failed: %s\n", strerror(errno));
+		munmap(start, 2 * page_size);
+		goto out;
+	}
+
+	success = is_range_mapped(maps_fp, start, start + 3 * page_size);
+	munmap(start, 3 * page_size);
+
+out:
 	if (success)
 		ksft_test_result_pass("%s\n", test_name);
 	else
 		ksft_test_result_fail("%s\n", test_name);
-	fclose(fp);
 }
 
 /*
@@ -380,11 +442,12 @@ int main(int argc, char **argv)
 	int i, run_perf_tests;
 	unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
 	unsigned int pattern_seed;
-	int num_expand_tests = 1;
+	int num_expand_tests = 2;
 	struct test test_cases[MAX_TEST];
 	struct test perf_test_cases[MAX_PERF_TEST];
 	int page_size;
 	time_t t;
+	FILE *maps_fp;
 
 	pattern_seed = (unsigned int) time(&t);
@@ -458,7 +521,17 @@ int main(int argc, char **argv)
 		run_mremap_test_case(test_cases[i], &failures, threshold_mb,
 				     pattern_seed);
 
-	mremap_expand_merge(page_size);
+	maps_fp = fopen("/proc/self/maps", "r");
+
+	if (maps_fp == NULL) {
+		ksft_print_msg("Failed to read /proc/self/maps: %s\n", strerror(errno));
+		exit(KSFT_FAIL);
+	}
+
+	mremap_expand_merge(maps_fp, page_size);
+	mremap_expand_merge_offset(maps_fp, page_size);
+
+	fclose(maps_fp);
 
 	if (run_perf_tests) {
 		ksft_print_msg("\n%s\n",
-- cgit v1.2.3


From f4d9139f1394cbe2de158ab8771fea4e587004d4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Mon, 9 Jan 2023 18:12:55 +0100
Subject: selftests/mm: define MADV_PAGEOUT to fix compilation issues

If MADV_PAGEOUT is not defined (e.g., on AlmaLinux 8), compilation will
fail. Let's fix that like khugepaged.c does by conditionally defining
MADV_PAGEOUT.

Link: https://lkml.kernel.org/r/20230109171255.488749-1-david@redhat.com
Fixes: 69c66add5663 ("selftests/vm: anon_cow: test COW handling of anonymous memory")
Signed-off-by: David Hildenbrand
Reported-by: Mirsad Goran Todorovac
Cc: Shuah Khan
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/mm/cow.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
index 16216d893d96..0eb2e8180aa5 100644
--- a/tools/testing/selftests/mm/cow.c
+++ b/tools/testing/selftests/mm/cow.c
@@ -30,6 +30,9 @@
 #include "../kselftest.h"
 #include "vm_util.h"
 
+#ifndef MADV_PAGEOUT
+#define MADV_PAGEOUT 21
+#endif
 #ifndef MADV_COLLAPSE
 #define MADV_COLLAPSE 25
 #endif
-- cgit v1.2.3


From d526643f155c431e8dfef643195f2d636d4e4bb5 Mon Sep 17 00:00:00 2001
From: Alexander Pantyukhin
Date: Sun, 8 Jan 2023 15:50:23 +0500
Subject: tools:cgroup:memcg_shrinker remove redundant import

Remove redundant import of the sys module. Also use the sort function
instead of sorted. It sorts the array in place without creating a new
one in memory.
Link: https://lkml.kernel.org/r/20230108105023.4289-1-apantykhin@gmail.com
Signed-off-by: Alexander Pantyukhin
Cc: Roman Gushchin
Signed-off-by: Andrew Morton
---
 tools/cgroup/memcg_shrinker.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/cgroup/memcg_shrinker.py b/tools/cgroup/memcg_shrinker.py
index 706ab27666a4..e81c3017ada9 100644
--- a/tools/cgroup/memcg_shrinker.py
+++ b/tools/cgroup/memcg_shrinker.py
@@ -5,7 +5,6 @@
 
 import os
 import argparse
-import sys
 
 
 def scan_cgroups(cgroup_root):
@@ -44,7 +43,7 @@ def main():
 
     cgroups = scan_cgroups("/sys/fs/cgroup/")
     shrinkers = scan_shrinkers("/sys/kernel/debug/shrinker/")
-    shrinkers = sorted(shrinkers, reverse = True, key = lambda x: x[0])
+    shrinkers.sort(reverse = True, key = lambda x: x[0])
 
     n = 0
     for s in shrinkers:
-- cgit v1.2.3


From 9a3f21fe5cb9f5654ccad7ba712d868f7de66e39 Mon Sep 17 00:00:00 2001
From: Björn Töpel
Date: Mon, 9 Jan 2023 12:42:51 +0100
Subject: selftests: vm: enable cross-compilation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Selftests vm builds break when doing cross-compilation. The Makefile
MACHINE variable incorrectly picks up the host machine architecture. If
the CROSS_COMPILE variable is set, dig out the target architecture from
CROSS_COMPILE, instead of calling uname.

Link: https://lkml.kernel.org/r/20230109114251.3349638-1-bjorn@kernel.org
Signed-off-by: Björn Töpel
Cc: Shuah Khan
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/mm/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 6a4b639b2b2b..0a44d77f8437 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -5,7 +5,11 @@ LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h
 
 include local_config.mk
 
+ifeq ($(CROSS_COMPILE),)
 uname_M := $(shell uname -m 2>/dev/null || echo not)
+else
+uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+')
+endif
 MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
 
 # Without this, failed build products remain, with up-to-date timestamps,
-- cgit v1.2.3


From 16ddcb15497e11a2695c604357e77140010d3d51 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Tue, 10 Jan 2023 19:03:59 +0000
Subject: selftests/damon/sysfs: hide expected write failures

The DAMON sysfs selftest (sysfs.sh) tests whether some writes to DAMON
sysfs interface files fail as expected. It makes the test results noisy
with failure error messages because it tests a number of such failures.
Redirect the expected failure error messages to /dev/null to make the
results clean.
Link: https://lkml.kernel.org/r/20230110190400.119388-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index a00336ffdcad..bcd4734ca094 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -24,7 +24,7 @@ ensure_write_fail() content=$2 reason=$3 - if echo "$content" > "$file" + if (echo "$content" > "$file") 2> /dev/null then echo "writing $content to $file succeed ($fail_reason)" echo "expected failure because $reason" -- cgit v1.2.3 From 75cb348714f527ce2de3446202b76ce74808b668 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:04:00 +0000 Subject: selftests/damon/debugfs_rm_non_contexts: hide expected write error messages A selftest case for DAMON debugfs interface has a test for expected failure. To make the test output clean, hide the expected failure error message. Link: https://lkml.kernel.org/r/20230110190400.119388-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/debugfs_rm_non_contexts.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh index 48b7af6b022c..f3ffeb1343cf 100644 --- a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh +++ b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh @@ -10,7 +10,7 @@ dmesg -C for file in "$DBGFS/"* do - echo "$(basename "$f")" > "$DBGFS/rm_contexts" + (echo "$(basename "$f")" > "$DBGFS/rm_contexts") &> /dev/null if dmesg | grep -q BUG then dmesg -- cgit v1.2.3 From c5d5546ea06512accc894cd19265c7041a6ac81a Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Tue, 10 Jan 2023 23:42:11 +0800 Subject: maple_tree: remove the parameter entry of mas_preallocate The parameter entry of mas_preallocate is not used, so drop it. 
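For callers the conversion is mechanical: nothing about the entry is needed
to size the preallocation, so the entry is now passed only at store time. A
minimal sketch of the resulting pattern, loosely modelled on the mm/mmap.c
callers changed below (the wrapper function is illustrative, not part of
the patch):

	/* Preallocate nodes up front so the store itself cannot fail. */
	static int example_store_vma(struct ma_state *mas,
				     struct vm_area_struct *vma)
	{
		if (mas_preallocate(mas, GFP_KERNEL))	/* entry argument dropped */
			return -ENOMEM;

		mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
		mas_store_prealloc(mas, vma);		/* entry supplied only here */
		return 0;
	}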
Link: https://lkml.kernel.org/r/20230110154211.1758562-1-vernon2gm@gmail.com Signed-off-by: Vernon Yang Cc: Liam Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- lib/maple_tree.c | 3 +-- mm/mmap.c | 16 ++++++++-------- mm/nommu.c | 8 ++++---- tools/testing/radix-tree/maple.c | 32 ++++++++++++++++---------------- 5 files changed, 30 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 815a27661517..a7bf58fd7cc6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -455,7 +455,7 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +int mas_preallocate(struct ma_state *mas, gfp_t gfp); bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index baff62a012e1..5be99550e36d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5700,12 +5700,11 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state - * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) +int mas_preallocate(struct ma_state *mas, gfp_t gfp) { int ret; diff --git a/mm/mmap.c b/mm/mmap.c index 425a9349e610..4fe29b8f99b0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -472,7 +472,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; if (vma->vm_file) { @@ -538,7 +538,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, /* Only handles expanding */ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) goto nomem; vma_adjust_trans_huge(vma, start, end, 0); @@ -712,7 +712,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; vma_adjust_trans_huge(orig_vma, start, end, adjust_next); @@ -1938,7 +1938,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Check that both stack segments have the same anon_vma? */ } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ @@ -2019,7 +2019,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. 
*/ @@ -2311,7 +2311,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&mt_detach, &mm->mmap_lock); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) return -ENOMEM; mas->last = end - 1; @@ -2680,7 +2680,7 @@ cannot_expand: goto free_vma; } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { error = -ENOMEM; if (file) goto close_and_free_vma; @@ -2953,7 +2953,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, can_vma_merge_after(vma, flags, NULL, NULL, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); diff --git a/mm/nommu.c b/mm/nommu.c index df1711acdf5b..0481922fe66e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -602,7 +602,7 @@ static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -633,7 +633,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) { MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -1091,7 +1091,7 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) goto error_maple_preallocate; region->vm_usage = 1; @@ -1369,7 +1369,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!new) goto err_vma_dup; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); goto err_mas_preallocate; diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 1f36bc1c5d36..958ee9bdb316 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35342,7 +35342,7 @@ static noinline void check_prealloc(struct maple_tree *mt) for (i = 0; i <= max; i++) mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35351,18 +35351,18 @@ static noinline void check_prealloc(struct maple_tree *mt) allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = 
mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35370,25 +35370,25 @@ static noinline void check_prealloc(struct maple_tree *mt) mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35397,12 +35397,12 @@ static noinline void check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mas_push_node(&mas, mn); MT_BUG_ON(mt, mas_allocated(&mas) != allocated); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35410,21 +35410,21 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35432,14 +35432,14 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); mas_destroy(&mas); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35447,7 +35447,7 @@ static noinline void check_prealloc(struct maple_tree *mt) 
 	mas_store_prealloc(&mas, ptr);
 	MT_BUG_ON(mt, mas_allocated(&mas) != 0);
 	mt_set_non_kernel(1);
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated != 0);
-- cgit v1.2.3


From 2973d8229b78d3f148e0c45916a1e8b237dc6167 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 13 Jan 2023 11:12:17 +0000
Subject: mm: discard __GFP_ATOMIC

__GFP_ATOMIC serves little purpose. Its main effect is to set
ALLOC_HARDER which adds a few little boosts to increase the chance of an
allocation succeeding, one of which is to lower the water-mark at which it
will succeed. It is *always* paired with __GFP_HIGH which sets ALLOC_HIGH
which also adjusts this watermark. It is probable that other users of
__GFP_HIGH should benefit from the other little bonuses that __GFP_ATOMIC
gets.

__GFP_ATOMIC also gives a warning if used with __GFP_DIRECT_RECLAIM.
There is little point to this. We already get a might_sleep() warning if
__GFP_DIRECT_RECLAIM is set.

__GFP_ATOMIC allows the "watermark_boost" to be side-stepped. It is
probable that testing ALLOC_HARDER is a better fit here.

__GFP_ATOMIC is used by tegra-smmu.c to check if the allocation might
sleep. This should test __GFP_DIRECT_RECLAIM instead.

This patch:
 - removes __GFP_ATOMIC
 - allows __GFP_HIGH allocations to ignore watermark boosting as well
   as GFP_ATOMIC requests.
 - makes other adjustments as suggested by the above.

The net result is no change to GFP_ATOMIC allocations. Other
allocations that use __GFP_HIGH will benefit from a few different extra
privileges. This affects:
  xen, dm, md, ntfs3
  the vermillion frame buffer
  hibernation
  ksm
  swap
all of which likely produce more benefit than cost if these selected
allocations are more likely to succeed quickly.

[mgorman: Minor adjustments to rework on top of a series]
Link: https://lkml.kernel.org/r/163712397076.13692.4727608274002939094@noble.neil.brown.name
Link: https://lkml.kernel.org/r/20230113111217.14134-7-mgorman@techsingularity.net
Signed-off-by: NeilBrown
Signed-off-by: Mel Gorman
Acked-by: Vlastimil Babka
Acked-by: Michal Hocko
Cc: Matthew Wilcox
Cc: Thierry Reding
Signed-off-by: Andrew Morton
---
 Documentation/mm/balance.rst   |  2 +-
 drivers/iommu/tegra-smmu.c     |  4 ++--
 include/linux/gfp_types.h      | 12 ++++--------
 include/trace/events/mmflags.h |  1 -
 lib/test_printf.c              |  8 ++++----
 mm/internal.h                  |  2 +-
 mm/page_alloc.c                | 13 +++----------
 tools/perf/builtin-kmem.c      |  1 -
 8 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'tools')

diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst
index 6a1fadf3e173..e38e9d83c1c7 100644
--- a/Documentation/mm/balance.rst
+++ b/Documentation/mm/balance.rst
@@ -6,7 +6,7 @@ Memory Balancing
 
 Started Jan 2000 by Kanoj Sarcar
 
-Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
+Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as
 well as for non __GFP_IO allocations.
 
 The first reason why a caller may avoid reclaim is that the caller can not
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 5b1af40221ec..af8d0e685260 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -671,12 +671,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as,
	 * allocate page in a sleeping context if GFP flags permit. Hence
	 * spinlock needs to be unlocked and re-locked after allocation.
*/ - if (!(gfp & __GFP_ATOMIC)) + if (gfpflags_allow_blocking(gfp)) spin_unlock_irqrestore(&as->lock, *flags); page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO); - if (!(gfp & __GFP_ATOMIC)) + if (gfpflags_allow_blocking(gfp)) spin_lock_irqsave(&as->lock, *flags); /* diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index d88c46ca82e1..5088637fe5c2 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -31,7 +31,7 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -116,11 +116,8 @@ typedef unsigned int __bitwise gfp_t; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -135,7 +132,6 @@ typedef unsigned int __bitwise gfp_t; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -329,7 +325,7 @@ typedef unsigned int __bitwise gfp_t; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. 
*/ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 412b5a46374c..9db52bc4ce19 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -31,7 +31,6 @@ gfpflag_string(__GFP_HIGHMEM), \ gfpflag_string(GFP_DMA32), \ gfpflag_string(__GFP_HIGH), \ - gfpflag_string(__GFP_ATOMIC), \ gfpflag_string(__GFP_IO), \ gfpflag_string(__GFP_FS), \ gfpflag_string(__GFP_NOWARN), \ diff --git a/lib/test_printf.c b/lib/test_printf.c index d34dc636b81c..46b4e6c414a3 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -674,17 +674,17 @@ flags(void) gfp = GFP_ATOMIC|__GFP_DMA; test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); - gfp = __GFP_ATOMIC; - test("__GFP_ATOMIC", "%pGg", &gfp); + gfp = __GFP_HIGH; + test("__GFP_HIGH", "%pGg", &gfp); /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); test(cmp_buffer, "%pGg", &gfp); - snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx", (unsigned long) gfp); - gfp |= __GFP_ATOMIC; + gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); kfree(cmp_buffer); diff --git a/mm/internal.h b/mm/internal.h index b0b88a95347f..2d09a7a0600a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 18ca33a1945d..0cfad30fb44c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4105,13 +4105,14 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages)) return true; + /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for __GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -5076,14 +5077,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int zonelist_iter_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. 
-	 */
-	if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
-			(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
-		gfp_mask &= ~__GFP_ATOMIC;
-
 restart:
 	compaction_retries = 0;
 	no_progress_loops = 0;
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 8ae0a1535293..f3029742b800 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -653,7 +653,6 @@ static const struct {
 	{ "__GFP_HIGHMEM",		"HM" },
 	{ "GFP_DMA32",			"D32" },
 	{ "__GFP_HIGH",			"H" },
-	{ "__GFP_ATOMIC",		"_A" },
 	{ "__GFP_IO",			"I" },
 	{ "__GFP_FS",			"F" },
 	{ "__GFP_NOWARN",		"NWR" },
-- cgit v1.2.3


From e6d2c436ff693869e83a65e61643b922e193e162 Mon Sep 17 00:00:00 2001
From: "Herton R. Krzesinski"
Date: Mon, 16 Jan 2023 19:49:21 -0300
Subject: tools/mm: allow users to provide additional cflags/ldflags

Right now there is no way to provide additional cflags/ldflags when
building tools/vm binaries. And using e.g. make CFLAGS= will override
the CFLAGS being set in the Makefile, making the build fail since it
requires the include of the ../lib dir (for libapi).

This change then allows you to specify:
CFLAGS= LDFLAGS= make V=1 -C tools/vm
And the options will be correctly appended as can be seen from the make
output.

Link: https://lkml.kernel.org/r/20230116224921.4106324-1-herton@redhat.com
Signed-off-by: Herton R. Krzesinski
Cc: Don Zickus
Cc: Justin Forbes
Cc: Vlastimil Babka
Cc: Scott Weaver
Signed-off-by: Andrew Morton
---
 tools/mm/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/mm/Makefile b/tools/mm/Makefile
index 9860622cbb15..6c1da51f4177 100644
--- a/tools/mm/Makefile
+++ b/tools/mm/Makefile
@@ -8,8 +8,8 @@ TARGETS=page-types slabinfo page_owner_sort
 LIB_DIR = ../lib/api
 LIBS = $(LIB_DIR)/libapi.a
 
-CFLAGS = -Wall -Wextra -I../lib/
-LDFLAGS = $(LIBS)
+CFLAGS += -Wall -Wextra -I../lib/
+LDFLAGS += $(LIBS)
 
 all: $(TARGETS)
-- cgit v1.2.3


From 4cf1fe34fd18b752ae2403927277715d4444f331 Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Thu, 19 Jan 2023 16:03:44 +0000
Subject: kselftest: vm: add tests for memory-deny-write-execute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add some tests to cover the new PR_SET_MDWE prctl.
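The semantics under test: once a task opts in with PR_MDWE_REFUSE_EXEC_GAIN,
mappings can no longer be created writable and executable at once, and an
existing mapping cannot gain the execute permission. A minimal userspace
sketch of what the new tests assert (assumes headers that define the MDWE
prctl constants; error handling trimmed):

	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/prctl.h>
	#include <unistd.h>

	int main(void)
	{
		size_t sz = getpagesize();
		void *wx, *ro;

		if (prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0L, 0L, 0L))
			return 1;	/* kernel without MDWE support */

		/* A writable+executable mapping is refused outright... */
		wx = mmap(NULL, sz, PROT_WRITE | PROT_EXEC,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		printf("W|X mmap: %s\n", wx == MAP_FAILED ? "refused" : "allowed");

		/* ...and a read-only mapping cannot gain PROT_EXEC afterwards. */
		ro = mmap(NULL, sz, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (ro != MAP_FAILED)
			printf("add EXEC: %s\n",
			       mprotect(ro, sz, PROT_READ | PROT_EXEC) ?
			       "refused" : "allowed");
		return 0;
	}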
Link: https://lkml.kernel.org/r/20230119160344.54358-3-joey.gouly@arm.com Co-developed-by: Joey Gouly Signed-off-by: Joey Gouly Signed-off-by: Kees Cook Cc: Shuah Khan Cc: Alexander Viro Cc: Catalin Marinas Cc: Jeremy Linton Cc: Lennart Poettering Cc: Mark Brown Cc: nd Cc: Szabolcs Nagy Cc: Topi Miettinen Cc: Zbigniew Jędrzejewski-Szmek Cc: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/mdwe_test.c | 197 +++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+) create mode 100644 tools/testing/selftests/mm/mdwe_test.c (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 0a44d77f8437..d90cdc06aa59 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -60,6 +60,7 @@ TEST_GEN_PROGS += soft-dirty TEST_GEN_PROGS += split_huge_page_test TEST_GEN_FILES += ksm_tests TEST_GEN_PROGS += ksm_functional_tests +TEST_GEN_PROGS += mdwe_test ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c new file mode 100644 index 000000000000..f466a099f1bf --- /dev/null +++ b/tools/testing/selftests/mm/mdwe_test.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifdef __aarch64__ +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#ifndef __aarch64__ +# define PROT_BTI 0 +#endif + +TEST(prctl_flags) +{ + EXPECT_LT(prctl(PR_SET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 0L, 7L), 0); + + EXPECT_LT(prctl(PR_GET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 0L, 7L), 0); +} + +FIXTURE(mdwe) +{ + void *p; + int flags; + size_t size; + pid_t pid; +}; + +FIXTURE_VARIANT(mdwe) +{ + bool enabled; + bool forked; +}; + +FIXTURE_VARIANT_ADD(mdwe, stock) +{ + .enabled = false, + .forked = false, +}; + +FIXTURE_VARIANT_ADD(mdwe, enabled) +{ + .enabled = true, + .forked = false, +}; + +FIXTURE_VARIANT_ADD(mdwe, forked) +{ + .enabled = true, + .forked = true, +}; + +FIXTURE_SETUP(mdwe) +{ + int ret, status; + + self->p = NULL; + self->flags = MAP_SHARED | MAP_ANONYMOUS; + self->size = getpagesize(); + + if (!variant->enabled) + return; + + ret = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0L, 0L, 0L); + ASSERT_EQ(ret, 0) { + TH_LOG("PR_SET_MDWE failed or unsupported"); + } + + ret = prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L); + ASSERT_EQ(ret, 1); + + if (variant->forked) { + self->pid = fork(); + ASSERT_GE(self->pid, 0) { + TH_LOG("fork failed\n"); + } + + if (self->pid > 0) { + ret = waitpid(self->pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + exit(WEXITSTATUS(status)); + } + } +} + +FIXTURE_TEARDOWN(mdwe) +{ + if (self->p && self->p != MAP_FAILED) + munmap(self->p, self->size); +} + +TEST_F(mdwe, mmap_READ_EXEC) +{ + self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + EXPECT_NE(self->p, MAP_FAILED); +} + +TEST_F(mdwe, mmap_WRITE_EXEC) +{ + self->p = mmap(NULL, self->size, PROT_WRITE | PROT_EXEC, self->flags, 0, 0); + if (variant->enabled) { + EXPECT_EQ(self->p, MAP_FAILED); + } else { + EXPECT_NE(self->p, MAP_FAILED); + } +} 
+
+TEST_F(mdwe, mprotect_stay_EXEC)
+{
+	int ret;
+
+	self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0);
+	ASSERT_NE(self->p, MAP_FAILED);
+
+	ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC);
+	EXPECT_EQ(ret, 0);
+}
+
+TEST_F(mdwe, mprotect_add_EXEC)
+{
+	int ret;
+
+	self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0);
+	ASSERT_NE(self->p, MAP_FAILED);
+
+	ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC);
+	if (variant->enabled) {
+		EXPECT_LT(ret, 0);
+	} else {
+		EXPECT_EQ(ret, 0);
+	}
+}
+
+TEST_F(mdwe, mprotect_WRITE_EXEC)
+{
+	int ret;
+
+	self->p = mmap(NULL, self->size, PROT_WRITE, self->flags, 0, 0);
+	ASSERT_NE(self->p, MAP_FAILED);
+
+	ret = mprotect(self->p, self->size, PROT_WRITE | PROT_EXEC);
+	if (variant->enabled) {
+		EXPECT_LT(ret, 0);
+	} else {
+		EXPECT_EQ(ret, 0);
+	}
+}
+
+TEST_F(mdwe, mmap_FIXED)
+{
+	void *p, *p2;
+
+	p2 = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0);
+	self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0);
+	ASSERT_NE(self->p, MAP_FAILED);
+
+	p = mmap(self->p + self->size, self->size, PROT_READ | PROT_EXEC,
+		 self->flags | MAP_FIXED, 0, 0);
+	if (variant->enabled) {
+		EXPECT_EQ(p, MAP_FAILED);
+	} else {
+		EXPECT_EQ(p, self->p);
+	}
+}
+
+TEST_F(mdwe, arm64_BTI)
+{
+	int ret;
+
+#ifdef __aarch64__
+	if (!(getauxval(AT_HWCAP2) & HWCAP2_BTI))
+#endif
+		SKIP(return, "HWCAP2_BTI not supported");
+
+	self->p = mmap(NULL, self->size, PROT_EXEC, self->flags, 0, 0);
+	ASSERT_NE(self->p, MAP_FAILED);
+
+	ret = mprotect(self->p, self->size, PROT_EXEC | PROT_BTI);
+	EXPECT_EQ(ret, 0);
+}
+
+TEST_HARNESS_MAIN
-- cgit v1.2.3


From d5d469247264e56960705dc5ae7e1d014861fe40 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Wed, 15 Feb 2023 14:00:58 +0100
Subject: objtool: add UACCESS exceptions for __tsan_volatile_read/write

A lot of the tsan helpers are already exempt from the UACCESS warnings,
but some more functions were added that need the same thing:

kernel/kcsan/core.o: warning: objtool: __tsan_volatile_read16+0x0: call to __tsan_unaligned_read16() with UACCESS enabled
kernel/kcsan/core.o: warning: objtool: __tsan_volatile_write16+0x0: call to __tsan_unaligned_write16() with UACCESS enabled
vmlinux.o: warning: objtool: __tsan_unaligned_volatile_read16+0x4: call to __tsan_unaligned_read16() with UACCESS enabled
vmlinux.o: warning: objtool: __tsan_unaligned_volatile_write16+0x4: call to __tsan_unaligned_write16() with UACCESS enabled

As Marco points out, these functions don't even call each other
explicitly but instead gcc (but not clang) notices the functions being
identical and turns one symbol into a direct branch to the other.
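The folding is easy to reproduce outside the kernel. A hypothetical
illustration (not from the patch): gcc's identical-code-folding pass,
enabled at -O2 via -fipa-icf, can merge two functions that compile to the
same code, leaving one symbol as a direct branch to the other, which is
exactly the kind of implicit call objtool then sees:

	/* icf_demo.c -- build with: gcc -O2 -c icf_demo.c
	 * Both functions compile to identical code, so gcc may emit read_b
	 * as a direct branch (or alias) to read_a; clang does not do this. */
	unsigned long read_a(const unsigned long *p) { return *p; }
	unsigned long read_b(const unsigned long *p) { return *p; }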
Link: https://lkml.kernel.org/r/20230215130058.3836177-4-arnd@kernel.org Fixes: 75d75b7a4d54 ("kcsan: Support distinguishing volatile accesses") Signed-off-by: Arnd Bergmann Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Josh Poimboeuf Cc: Kuan-Ying Lee Cc: Peter Zijlstra (Intel) Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- tools/objtool/check.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4b7c8b33069e..b1a5f658673f 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1186,6 +1186,8 @@ static const char *uaccess_safe_builtin[] = { "__tsan_atomic64_compare_exchange_val", "__tsan_atomic_thread_fence", "__tsan_atomic_signal_fence", + "__tsan_unaligned_read16", + "__tsan_unaligned_write16", /* KCOV */ "write_comp_data", "check_kcov_mode", -- cgit v1.2.3