diff --git a/tools/mm/Makefile b/tools/mm/Makefile index 1c5606cc3334..7bb03606b9ea 100644 --- a/tools/mm/Makefile +++ b/tools/mm/Makefile @@ -3,7 +3,8 @@ # include ../scripts/Makefile.include -TARGETS=page-types slabinfo page_owner_sort +BUILD_TARGETS=page-types slabinfo page_owner_sort +INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps LIB_DIR = ../lib/api LIBS = $(LIB_DIR)/libapi.a @@ -11,9 +12,9 @@ LIBS = $(LIB_DIR)/libapi.a CFLAGS += -Wall -Wextra -I../lib/ -pthread LDFLAGS += $(LIBS) -pthread -all: $(TARGETS) +all: $(BUILD_TARGETS) -$(TARGETS): $(LIBS) +$(BUILD_TARGETS): $(LIBS) $(LIBS): make -C $(LIB_DIR) @@ -29,4 +30,4 @@ sbindir ?= /usr/sbin install: all install -d $(DESTDIR)$(sbindir) - install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir) + install -m 755 -p $(INSTALL_TARGETS) $(DESTDIR)$(sbindir) diff --git a/tools/mm/thpmaps b/tools/mm/thpmaps new file mode 100644 index 000000000000..803e0318f2fe --- /dev/null +++ b/tools/mm/thpmaps @@ -0,0 +1,675 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) 2024 ARM Ltd. +# +# Utility providing smaps-like output detailing transparent hugepage usage. +# For more info, run: +# ./thpmaps --help +# +# Requires numpy: +# pip3 install numpy + + +import argparse +import collections +import math +import os +import re +import resource +import shutil +import sys +import textwrap +import time +import numpy as np + + +with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f: + PAGE_SIZE = resource.getpagesize() + PAGE_SHIFT = int(math.log2(PAGE_SIZE)) + PMD_SIZE = int(f.read()) + PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE)) + + +def align_forward(v, a): + return (v + (a - 1)) & ~(a - 1) + + +def align_offset(v, a): + return v & (a - 1) + + +def kbnr(kb): + # Convert KB to number of pages. + return (kb << 10) >> PAGE_SHIFT + + +def nrkb(nr): + # Convert number of pages to KB. + return (nr << PAGE_SHIFT) >> 10 + + +def odkb(order): + # Convert page order to KB. + return (PAGE_SIZE << order) >> 10 + + +def cont_ranges_all(search, index): + # Given a list of arrays, find the ranges for which values are monotonically + # incrementing in all arrays. all arrays in search and index must be the + # same size. + sz = len(search[0]) + r = np.full(sz, 2) + d = np.diff(search[0]) == 1 + for dd in [np.diff(arr) == 1 for arr in search[1:]]: + d &= dd + r[1:] -= d + r[:-1] -= d + return [np.repeat(arr, r).reshape(-1, 2) for arr in index] + + +class ArgException(Exception): + pass + + +class FileIOException(Exception): + pass + + +class BinArrayFile: + # Base class used to read /proc//pagemap and /proc/kpageflags into a + # numpy array. Use inherrited class in a with clause to ensure file is + # closed when it goes out of scope. + def __init__(self, filename, element_size): + self.element_size = element_size + self.filename = filename + self.fd = os.open(self.filename, os.O_RDONLY) + + def cleanup(self): + os.close(self.fd) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.cleanup() + + def _readin(self, offset, buffer): + length = os.preadv(self.fd, (buffer,), offset) + if len(buffer) != length: + raise FileIOException('error: {} failed to read {} bytes at {:x}' + .format(self.filename, len(buffer), offset)) + + def _toarray(self, buf): + assert(self.element_size == 8) + return np.frombuffer(buf, dtype=np.uint64) + + def getv(self, vec): + vec *= self.element_size + offsets = vec[:, 0] + lengths = (np.diff(vec) + self.element_size).reshape(len(vec)) + buf = bytearray(int(np.sum(lengths))) + view = memoryview(buf) + pos = 0 + for offset, length in zip(offsets, lengths): + offset = int(offset) + length = int(length) + self._readin(offset, view[pos:pos+length]) + pos += length + return self._toarray(buf) + + def get(self, index, nr=1): + offset = index * self.element_size + length = nr * self.element_size + buf = bytearray(length) + self._readin(offset, buf) + return self._toarray(buf) + + +PM_PAGE_PRESENT = 1 << 63 +PM_PFN_MASK = (1 << 55) - 1 + +class PageMap(BinArrayFile): + # Read ranges of a given pid's pagemap into a numpy array. + def __init__(self, pid='self'): + super().__init__(f'/proc/{pid}/pagemap', 8) + + +KPF_ANON = 1 << 12 +KPF_COMPOUND_HEAD = 1 << 15 +KPF_COMPOUND_TAIL = 1 << 16 +KPF_THP = 1 << 22 + +class KPageFlags(BinArrayFile): + # Read ranges of /proc/kpageflags into a numpy array. + def __init__(self): + super().__init__(f'/proc/kpageflags', 8) + + +vma_all_stats = set([ + "Size", + "Rss", + "Pss", + "Pss_Dirty", + "Shared_Clean", + "Shared_Dirty", + "Private_Clean", + "Private_Dirty", + "Referenced", + "Anonymous", + "KSM", + "LazyFree", + "AnonHugePages", + "ShmemPmdMapped", + "FilePmdMapped", + "Shared_Hugetlb", + "Private_Hugetlb", + "Swap", + "SwapPss", + "Locked", +]) + +vma_min_stats = set([ + "Rss", + "Anonymous", + "AnonHugePages", + "ShmemPmdMapped", + "FilePmdMapped", +]) + +VMA = collections.namedtuple('VMA', [ + 'name', + 'start', + 'end', + 'read', + 'write', + 'execute', + 'private', + 'pgoff', + 'major', + 'minor', + 'inode', + 'stats', +]) + +class VMAList: + # A container for VMAs, parsed from /proc//smaps. Iterate over the + # instance to receive VMAs. + def __init__(self, pid='self', stats=[]): + self.vmas = [] + with open(f'/proc/{pid}/smaps', 'r') as file: + for line in file: + elements = line.split() + if '-' in elements[0]: + start, end = map(lambda x: int(x, 16), elements[0].split('-')) + major, minor = map(lambda x: int(x, 16), elements[3].split(':')) + self.vmas.append(VMA( + name=elements[5] if len(elements) == 6 else '', + start=start, + end=end, + read=elements[1][0] == 'r', + write=elements[1][1] == 'w', + execute=elements[1][2] == 'x', + private=elements[1][3] == 'p', + pgoff=int(elements[2], 16), + major=major, + minor=minor, + inode=int(elements[4], 16), + stats={}, + )) + else: + param = elements[0][:-1] + if param in stats: + value = int(elements[1]) + self.vmas[-1].stats[param] = {'type': None, 'value': value} + + def __iter__(self): + yield from self.vmas + + +def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads): + # Given 4 same-sized arrays representing a range within a page table backed + # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons: + # True if page is anonymous, heads: True if page is head of a THP), return a + # dictionary of statistics describing the mapped THPs. + stats = { + 'file': { + 'partial': 0, + 'aligned': [0] * (PMD_ORDER + 1), + 'unaligned': [0] * (PMD_ORDER + 1), + }, + 'anon': { + 'partial': 0, + 'aligned': [0] * (PMD_ORDER + 1), + 'unaligned': [0] * (PMD_ORDER + 1), + }, + } + + for rindex, rpfn in zip(ranges[0], ranges[2]): + index_next = int(rindex[0]) + index_end = int(rindex[1]) + 1 + pfn_end = int(rpfn[1]) + 1 + + folios = indexes[index_next:index_end][heads[index_next:index_end]] + + # Account pages for any partially mapped THP at the front. In that case, + # the first page of the range is a tail. + nr = (int(folios[0]) if len(folios) else index_end) - index_next + stats['anon' if anons[index_next] else 'file']['partial'] += nr + + # Account pages for any partially mapped THP at the back. In that case, + # the next page after the range is a tail. + if len(folios): + flags = int(kpageflags.get(pfn_end)[0]) + if flags & KPF_COMPOUND_TAIL: + nr = index_end - int(folios[-1]) + folios = folios[:-1] + index_end -= nr + stats['anon' if anons[index_end - 1] else 'file']['partial'] += nr + + # Account fully mapped THPs in the middle of the range. + if len(folios): + folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1])) + folio_orders = np.log2(folio_nrs).astype(np.uint64) + for index, order in zip(folios, folio_orders): + index = int(index) + order = int(order) + nr = 1 << order + vfn = int(vfns[index]) + align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned' + anon = 'anon' if anons[index] else 'file' + stats[anon][align][order] += nr + + # Account PMD-mapped THPs spearately, so filter out of the stats. There is a + # race between acquiring the smaps stats and reading pagemap, where memory + # could be deallocated. So clamp to zero incase it would have gone negative. + anon_pmd_mapped = vma.stats['AnonHugePages']['value'] + file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \ + vma.stats['FilePmdMapped']['value'] + stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped)) + stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped)) + + rstats = { + f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped}, + f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped}, + } + + def flatten_sub(type, subtype, stats): + param = f"{type}-thp-pte-{subtype}-{{}}kB" + for od, nr in enumerate(stats[2:], 2): + rstats[param.format(odkb(od))] = {'type': type, 'value': nrkb(nr)} + + def flatten_type(type, stats): + flatten_sub(type, 'aligned', stats['aligned']) + flatten_sub(type, 'unaligned', stats['unaligned']) + rstats[f"{type}-thp-pte-partial"] = {'type': type, 'value': nrkb(stats['partial'])} + + flatten_type('anon', stats['anon']) + flatten_type('file', stats['file']) + + return rstats + + +def cont_parse(vma, order, ranges, anons, heads): + # Given 4 same-sized arrays representing a range within a page table backed + # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons: + # True if page is anonymous, heads: True if page is head of a THP), return a + # dictionary of statistics describing the contiguous blocks. + nr_cont = 1 << order + nr_anon = 0 + nr_file = 0 + + for rindex, rvfn, rpfn in zip(*ranges): + index_next = int(rindex[0]) + index_end = int(rindex[1]) + 1 + vfn_start = int(rvfn[0]) + pfn_start = int(rpfn[0]) + + if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont): + continue + + off = align_forward(vfn_start, nr_cont) - vfn_start + index_next += off + + while index_next + nr_cont <= index_end: + folio_boundary = heads[index_next+1:index_next+nr_cont].any() + if not folio_boundary: + if anons[index_next]: + nr_anon += nr_cont + else: + nr_file += nr_cont + index_next += nr_cont + + # Account blocks that are PMD-mapped spearately, so filter out of the stats. + # There is a race between acquiring the smaps stats and reading pagemap, + # where memory could be deallocated. So clamp to zero incase it would have + # gone negative. + anon_pmd_mapped = vma.stats['AnonHugePages']['value'] + file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \ + vma.stats['FilePmdMapped']['value'] + nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped)) + nr_file = max(0, nr_file - kbnr(file_pmd_mapped)) + + rstats = { + f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped}, + f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped}, + } + + rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)} + rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)} + + return rstats + + +def vma_print(vma, pid): + # Prints a VMA instance in a format similar to smaps. The main difference is + # that the pid is included as the first value. + print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}" + .format( + pid, vma.start, vma.end, + 'r' if vma.read else '-', 'w' if vma.write else '-', + 'x' if vma.execute else '-', 'p' if vma.private else 's', + vma.pgoff, vma.major, vma.minor, vma.inode, vma.name + )) + + +def stats_print(stats, tot_anon, tot_file, inc_empty): + # Print a statistics dictionary. + label_field = 32 + for label, stat in stats.items(): + type = stat['type'] + value = stat['value'] + if value or inc_empty: + pad = max(0, label_field - len(label) - 1) + if type == 'anon' and tot_anon > 0: + percent = f' ({value / tot_anon:3.0%})' + elif type == 'file' and tot_file > 0: + percent = f' ({value / tot_file:3.0%})' + else: + percent = '' + print(f"{label}:{' ' * pad}{value:8} kB{percent}") + + +def vma_parse(vma, pagemap, kpageflags, contorders): + # Generate thp and cont statistics for a single VMA. + start = vma.start >> PAGE_SHIFT + end = vma.end >> PAGE_SHIFT + + pmes = pagemap.get(start, end - start) + present = pmes & PM_PAGE_PRESENT != 0 + pfns = pmes & PM_PFN_MASK + pfns = pfns[present] + vfns = np.arange(start, end, dtype=np.uint64) + vfns = vfns[present] + + pfn_vec = cont_ranges_all([pfns], [pfns])[0] + flags = kpageflags.getv(pfn_vec) + anons = flags & KPF_ANON != 0 + heads = flags & KPF_COMPOUND_HEAD != 0 + thps = flags & KPF_THP != 0 + + vfns = vfns[thps] + pfns = pfns[thps] + anons = anons[thps] + heads = heads[thps] + + indexes = np.arange(len(vfns), dtype=np.uint64) + ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns]) + + thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads) + contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders] + + tot_anon = vma.stats['Anonymous']['value'] + tot_file = vma.stats['Rss']['value'] - tot_anon + + return { + **thpstats, + **{k: v for s in contstats for k, v in s.items()} + }, tot_anon, tot_file + + +def do_main(args): + pids = set() + rollup = {} + rollup_anon = 0 + rollup_file = 0 + + if args.cgroup: + strict = False + for walk_info in os.walk(args.cgroup): + cgroup = walk_info[0] + with open(f'{cgroup}/cgroup.procs') as pidfile: + for line in pidfile.readlines(): + pids.add(int(line.strip())) + elif args.pid: + strict = True + pids = pids.union(args.pid) + else: + strict = False + for pid in os.listdir('/proc'): + if pid.isdigit(): + pids.add(int(pid)) + + if not args.rollup: + print(" PID START END PROT OFFSET DEV INODE OBJECT") + + for pid in pids: + try: + with PageMap(pid) as pagemap: + with KPageFlags() as kpageflags: + for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats): + if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0: + stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont) + else: + stats = {} + vma_anon = 0 + vma_file = 0 + if args.inc_smaps: + stats = {**vma.stats, **stats} + if args.rollup: + for k, v in stats.items(): + if k in rollup: + assert(rollup[k]['type'] == v['type']) + rollup[k]['value'] += v['value'] + else: + rollup[k] = v + rollup_anon += vma_anon + rollup_file += vma_file + else: + vma_print(vma, pid) + stats_print(stats, vma_anon, vma_file, args.inc_empty) + except (FileNotFoundError, ProcessLookupError, FileIOException): + if strict: + raise + + if args.rollup: + stats_print(rollup, rollup_anon, rollup_file, args.inc_empty) + + +def main(): + docs_width = shutil.get_terminal_size().columns + docs_width -= 2 + docs_width = min(80, docs_width) + + def format(string): + text = re.sub(r'\s+', ' ', string) + text = re.sub(r'\s*\\n\s*', '\n', text) + paras = text.split('\n') + paras = [textwrap.fill(p, width=docs_width) for p in paras] + return '\n'.join(paras) + + def formatter(prog): + return argparse.RawDescriptionHelpFormatter(prog, width=docs_width) + + def size2order(human): + units = { + "K": 2**10, "M": 2**20, "G": 2**30, + "k": 2**10, "m": 2**20, "g": 2**30, + } + unit = 1 + if human[-1] in units: + unit = units[human[-1]] + human = human[:-1] + try: + size = int(human) + except ValueError: + raise ArgException('error: --cont value must be integer size with optional KMG unit') + size *= unit + order = int(math.log2(size / PAGE_SIZE)) + if order < 1: + raise ArgException('error: --cont value must be size of at least 2 pages') + if (1 << order) * PAGE_SIZE != size: + raise ArgException('error: --cont value must be size of power-of-2 pages') + if order > PMD_ORDER: + raise ArgException('error: --cont value must be less than or equal to PMD order') + return order + + parser = argparse.ArgumentParser(formatter_class=formatter, + description=format("""Prints information about how transparent huge + pages are mapped, either system-wide, or for a specified + process or cgroup.\\n + \\n + When run with --pid, the user explicitly specifies the set + of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run + with --cgroup, the user passes either a v1 or v2 cgroup and + all pids that belong to the cgroup subtree are scanned. When + run with neither --pid nor --cgroup, the full set of pids on + the system is gathered from /proc and scanned as if the user + had provided "--pid 1 --pid 2 ...".\\n + \\n + A default set of statistics is always generated for THP + mappings. However, it is also possible to generate + additional statistics for "contiguous block mappings" where + the block size is user-defined.\\n + \\n + Statistics are maintained independently for anonymous and + file-backed (pagecache) memory and are shown both in kB and + as a percentage of either total anonymous or total + file-backed memory as appropriate.\\n + \\n + THP Statistics\\n + --------------\\n + \\n + Statistics are always generated for fully- and + contiguously-mapped THPs whose mapping address is aligned to + their size, for each supported by the system. + Separate counters describe THPs mapped by PTE vs those + mapped by PMD. (Although note a THP can only be mapped by + PMD if it is PMD-sized):\\n + \\n + - anon-thp-pte-aligned-kB\\n + - file-thp-pte-aligned-kB\\n + - anon-thp-pmd-aligned-kB\\n + - file-thp-pmd-aligned-kB\\n + \\n + Similarly, statistics are always generated for fully- and + contiguously-mapped THPs whose mapping address is *not* + aligned to their size, for each supported by the + system. Due to the unaligned mapping, it is impossible to + map by PMD, so there are only PTE counters for this case:\\n + \\n + - anon-thp-pte-unaligned-kB\\n + - file-thp-pte-unaligned-kB\\n + \\n + Statistics are also always generated for mapped pages that + belong to a THP but where the is THP is *not* fully- and + contiguously- mapped. These "partial" mappings are all + counted in the same counter regardless of the size of the + THP that is partially mapped:\\n + \\n + - anon-thp-pte-partial\\n + - file-thp-pte-partial\\n + \\n + Contiguous Block Statistics\\n + ---------------------------\\n + \\n + An optional, additional set of statistics is generated for + every contiguous block size specified with `--cont `. + These statistics show how much memory is mapped in + contiguous blocks of and also aligned to . A + given contiguous block must all belong to the same THP, but + there is no requirement for it to be the *whole* THP. + Separate counters describe contiguous blocks mapped by PTE + vs those mapped by PMD:\\n + \\n + - anon-cont-pte-aligned-kB\\n + - file-cont-pte-aligned-kB\\n + - anon-cont-pmd-aligned-kB\\n + - file-cont-pmd-aligned-kB\\n + \\n + As an example, if monitoring 64K contiguous blocks (--cont + 64K), there are a number of sources that could provide such + blocks: a fully- and contiguously-mapped 64K THP that is + aligned to a 64K boundary would provide 1 block. A fully- + and contiguously-mapped 128K THP that is aligned to at least + a 64K boundary would provide 2 blocks. Or a 128K THP that + maps its first 100K, but contiguously and starting at a 64K + boundary would provide 1 block. A fully- and + contiguously-mapped 2M THP would provide 32 blocks. There + are many other possible permutations.\\n"""), + epilog=format("""Requires root privilege to access pagemap and + kpageflags.""")) + + group = parser.add_mutually_exclusive_group(required=False) + group.add_argument('--pid', + metavar='pid', required=False, type=int, default=[], action='append', + help="""Process id of the target process. Maybe issued multiple times to + scan multiple processes. --pid and --cgroup are mutually exclusive. + If neither are provided, all processes are scanned to provide + system-wide information.""") + + group.add_argument('--cgroup', + metavar='path', required=False, + help="""Path to the target cgroup in sysfs. Iterates over every pid in + the cgroup and its children. --pid and --cgroup are mutually + exclusive. If neither are provided, all processes are scanned to + provide system-wide information.""") + + parser.add_argument('--rollup', + required=False, default=False, action='store_true', + help="""Sum the per-vma statistics to provide a summary over the whole + system, process or cgroup.""") + + parser.add_argument('--cont', + metavar='size[KMG]', required=False, default=[], action='append', + help="""Adds stats for memory that is mapped in contiguous blocks of + and also aligned to . May be issued multiple times to + track multiple sized blocks. Useful to infer e.g. arm64 contpte and + hpa mappings. Size must be a power-of-2 number of pages.""") + + parser.add_argument('--inc-smaps', + required=False, default=False, action='store_true', + help="""Include all numerical, additive /proc//smaps stats in the + output.""") + + parser.add_argument('--inc-empty', + required=False, default=False, action='store_true', + help="""Show all statistics including those whose value is 0.""") + + parser.add_argument('--periodic', + metavar='sleep_ms', required=False, type=int, + help="""Run in a loop, polling every sleep_ms milliseconds.""") + + args = parser.parse_args() + + try: + args.cont = [size2order(cont) for cont in args.cont] + except ArgException as e: + parser.print_usage() + raise + + if args.periodic: + while True: + do_main(args) + print() + time.sleep(args.periodic / 1000) + else: + do_main(args) + + +if __name__ == "__main__": + try: + main() + except Exception as e: + prog = os.path.basename(sys.argv[0]) + print(f'{prog}: {e}') + exit(1)