zfs: merge openzfs/zfs@90ba19eb7

Notable upstream pull request merges: #15784 90ba19eb7b Do no use .cfi_negate_ra_state within the assembly on Arm64 #15942 b6bbaa8372 Give a better message from 'zpool get' with invalid pool name #15943 ca678bc0bc Makefile.bsd: sort and cleanup source file list #15953 fa480fe5ba zinject: show more device fault fields #15983 30c4eba4ea Fix panics when truncating/deleting files #15992 99741bde59 zvol: use multiple taskq #16015 a100a195fa Add support for zfs mount -R <filesystem> #16022 5e5fd0a178 Speculative prefetch for reordered requests #16040 997f85b4d3 L2ARC: Relax locking during write #16042 b12738182c Improve dbuf_read() error reporting #16051 a9a4290173 xdr: header cleanup #16052 eeca9a91d6 Fix read errors race after block cloning #16057 aa5445c28b Remove db_state DB_NOFILL checks from syncing context #16061 76d1dde94c zinject: inject device errors into ioctls #16072 9e63631dea Small fix to prefetch ranges aggregation #16077 44f337be30 Illumos#16463 zfs_ioc_recv leaks nvlist #16085 4725e543be zinject: "no-op" error injection #16086 c6da985e28 Add the BTI elf note to the AArch64 SHA2 assembly Obtained from: OpenZFS OpenZFS commit: 90ba19eb7b
2024-07-22 10:48:02 +00:00 · 2024-04-16 22:52:34 +02:00 · 2024-04-16 22:52:34 +02:00 · 1719886f6d
parent e4a0c92e7a 90ba19eb7b
commit 1719886f6d
83 changed files with 2483 additions and 872 deletions
--- a/cddl/lib/libnvpair/Makefile
+++ b/cddl/lib/libnvpair/Makefile
@ -27,6 +27,4 @@ CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
 CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
 CFLAGS+= -DHAVE_ISSETUGID  -DHAVE_CONFIG_H -DHAVE_XDR_BYTESREC

-
-CFLAGS.nvpair.c+= -UHAVE_RPC_TYPES
 .include <bsd.lib.mk>
--- a/sys/contrib/openzfs/.mailmap
+++ b/sys/contrib/openzfs/.mailmap
@ -30,6 +30,7 @@ Andreas Dilger <adilger@dilger.ca>
 Andrew Walker <awalker@ixsystems.com>
 Benedikt Neuffer <github@itfriend.de>
 Chengfei Zhu <chengfeix.zhu@intel.com>
+ChenHao Lu <18302010006@fudan.edu.cn>
 Chris Lindee <chris.lindee+github@gmail.com>
 Colm Buckley <colm@tuatha.org>
 Crag Wang <crag0715@gmail.com>
@ -43,6 +44,7 @@ Glenn Washburn <development@efficientek.com>
 Gordan Bobic <gordan.bobic@gmail.com>
 Gregory Bartholomew <gregory.lee.bartholomew@gmail.com>
 hedong zhang <h_d_zhang@163.com>
+Ilkka Sovanto <github@ilkka.kapsi.fi>
 InsanePrawn <Insane.Prawny@gmail.com>
 Jason Cohen <jwittlincohen@gmail.com>
 Jason Harmening <jason.harmening@gmail.com>
@ -57,6 +59,7 @@ KernelOfTruth <kerneloftruth@gmail.com>
 Liu Hua <liu.hua130@zte.com.cn>
 Liu Qing <winglq@gmail.com>
 loli10K <ezomori.nozomu@gmail.com>
+Mart Frauenlob <allkind@fastest.cc>
 Matthias Blankertz <matthias@blankertz.org>
 Michael Gmelin <grembo@FreeBSD.org>
 Olivier Mazouffre <olivier.mazouffre@ims-bordeaux.fr>
@ -73,6 +76,9 @@ WHR <msl0000023508@gmail.com>
 Yanping Gao <yanping.gao@xtaotech.com>
 Youzhong Yang <youzhong@gmail.com>

+# Signed-off-by: overriding Author:
+Yuxin Wang <yuxinwang9999@gmail.com> <Bi11gates9999@gmail.com>
+
 # Commits from strange places, long ago
 Brian Behlendorf <behlendorf1@llnl.gov> <behlendo@7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c>
 Brian Behlendorf <behlendorf1@llnl.gov> <behlendo@fedora-17-amd64.(none)>
@ -102,12 +108,15 @@ Brandon Thetford <brandon@dodecatec.com> <dodexahedron@users.noreply.github.com>
 buzzingwires <buzzingwires@outlook.com> <131118055+buzzingwires@users.noreply.github.com>
 Cedric Maunoury <cedric.maunoury@gmail.com> <38213715+cedricmaunoury@users.noreply.github.com>
 Charles Suh <charles.suh@gmail.com> <charlessuh@users.noreply.github.com>
+Chris Peredun <chris.peredun@ixsystems.com> <126915832+chrisperedun@users.noreply.github.com>
 Dacian Reece-Stremtan <dacianstremtan@gmail.com> <35844628+dacianstremtan@users.noreply.github.com>
 Damian Szuberski <szuberskidamian@gmail.com> <30863496+szubersk@users.noreply.github.com>
 Daniel Hiepler <d-git@coderdu.de> <32984777+heeplr@users.noreply.github.com>
 Daniel Kobras <d.kobras@science-computing.de> <sckobras@users.noreply.github.com>
 Daniel Reichelt <hacking@nachtgeist.net> <nachtgeist@users.noreply.github.com>
 David Quigley <david.quigley@intel.com> <dpquigl@users.noreply.github.com>
+Dennis R. Friedrichsen <dennis.r.friedrichsen@gmail.com> <31087738+dennisfriedrichsen@users.noreply.github.com>
+Dex Wood <slash2314@gmail.com> <slash2314@users.noreply.github.com>
 DHE <git@dehacked.net> <DeHackEd@users.noreply.github.com>
 Dmitri John Ledkov <dimitri.ledkov@canonical.com> <19779+xnox@users.noreply.github.com>
 Dries Michiels <driesm.michiels@gmail.com> <32487486+driesmp@users.noreply.github.com>
@ -128,6 +137,7 @@ Harry Mallon <hjmallon@gmail.com> <1816667+hjmallon@users.noreply.github.com>
 Hiếu Lê <leorize+oss@disroot.org> <alaviss@users.noreply.github.com>
 Jake Howard <git@theorangeone.net> <RealOrangeOne@users.noreply.github.com>
 James Cowgill <james.cowgill@mips.com> <jcowgill@users.noreply.github.com>
+Jaron Kent-Dobias <jaron@kent-dobias.com> <kentdobias@users.noreply.github.com>
 Jason King <jason.king@joyent.com> <jasonbking@users.noreply.github.com>
 Jeff Dike <jdike@akamai.com> <52420226+jdike@users.noreply.github.com>
 Jitendra Patidar <jitendra.patidar@nutanix.com> <53164267+jsai20@users.noreply.github.com>
@ -137,7 +147,9 @@ John L. Hammond <john.hammond@intel.com> <35266395+jhammond-intel@users.noreply.
 John-Mark Gurney <jmg@funkthat.com> <jmgurney@users.noreply.github.com>
 John Ramsden <johnramsden@riseup.net> <johnramsden@users.noreply.github.com>
 Jonathon Fernyhough <jonathon@m2x.dev> <559369+jonathonf@users.noreply.github.com>
+Jose Luis Duran <jlduran@gmail.com> <jlduran@users.noreply.github.com>
 Justin Hibbits <chmeeedalf@gmail.com> <chmeeedalf@users.noreply.github.com>
+Kevin Greene <kevin.greene@delphix.com> <104801862+kxgreene@users.noreply.github.com>
 Kevin Jin <lostking2008@hotmail.com> <33590050+jxdking@users.noreply.github.com>
 Kevin P. Fleming <kevin@km6g.us> <kpfleming@users.noreply.github.com>
 Krzysztof Piecuch <piecuch@kpiecuch.pl> <3964215+pikrzysztof@users.noreply.github.com>
@ -148,9 +160,11 @@ Lorenz Hüdepohl <dev@stellardeath.org> <lhuedepohl@users.noreply.github.com>
 Luís Henriques <henrix@camandro.org> <73643340+lumigch@users.noreply.github.com>
 Marcin Skarbek <git@skarbek.name> <mskarbek@users.noreply.github.com>
 Matt Fiddaman <github@m.fiddaman.uk> <81489167+matt-fidd@users.noreply.github.com>
+Maxim Filimonov <che@bein.link> <part1zano@users.noreply.github.com>
 Max Zettlmeißl <max@zettlmeissl.de> <6818198+maxz@users.noreply.github.com>
 Michael Niewöhner <foss@mniewoehner.de> <c0d3z3r0@users.noreply.github.com>
 Michael Zhivich <mzhivich@akamai.com> <33133421+mzhivich@users.noreply.github.com>
+MigeljanImeri <ImeriMigel@gmail.com> <78048439+MigeljanImeri@users.noreply.github.com>
 Mo Zhou <cdluminate@gmail.com> <5723047+cdluminate@users.noreply.github.com>
 Nick Mattis <nickm970@gmail.com> <nmattis@users.noreply.github.com>
 omni <omni+vagant@hack.org> <79493359+omnivagant@users.noreply.github.com>
@ -164,6 +178,7 @@ Ping Huang <huangping@smartx.com> <101400146+hpingfs@users.noreply.github.com>
 Piotr P. Stefaniak <pstef@freebsd.org> <pstef@users.noreply.github.com>
 Richard Allen <belperite@gmail.com> <33836503+belperite@users.noreply.github.com>
 Rich Ercolani <rincebrain@gmail.com> <214141+rincebrain@users.noreply.github.com>
+Rick Macklem <rmacklem@uoguelph.ca> <64620010+rmacklem@users.noreply.github.com>
 Rob Wing <rob.wing@klarasystems.com> <98866084+rob-wing@users.noreply.github.com>
 Roman Strashkin <roman.strashkin@nexenta.com> <Ramzec@users.noreply.github.com>
 Ryan Hirasaki <ryanhirasaki@gmail.com> <4690732+RyanHir@users.noreply.github.com>
@ -174,6 +189,8 @@ Scott Colby <scott@scolby.com> <scolby33@users.noreply.github.com>
 Sean Eric Fagan <kithrup@mac.com> <kithrup@users.noreply.github.com>
 Spencer Kinny <spencerkinny1995@gmail.com> <30333052+Spencer-Kinny@users.noreply.github.com>
 Srikanth N S <srikanth.nagasubbaraoseetharaman@hpe.com> <75025422+nssrikanth@users.noreply.github.com>
+Stefan Lendl <s.lendl@proxmox.com> <1321542+stfl@users.noreply.github.com>
+Thomas Bertschinger <bertschinger@lanl.gov> <101425190+bertschinger@users.noreply.github.com>
 Thomas Geppert <geppi@digitx.de> <geppi@users.noreply.github.com>
 Tim Crawford <tcrawford@datto.com> <crawfxrd@users.noreply.github.com>
 Tom Matthews <tom@axiom-partners.com> <tomtastic@users.noreply.github.com>
@ -181,6 +198,7 @@ Tony Perkins <tperkins@datto.com> <62951051+tony-zfs@users.noreply.github.com>
 Torsten Wörtwein <twoertwein@gmail.com> <twoertwein@users.noreply.github.com>
 Tulsi Jain <tulsi.jain@delphix.com> <TulsiJain@users.noreply.github.com>
 Václav Skála <skala@vshosting.cz> <33496485+vaclavskala@users.noreply.github.com>
+Vaibhav Bhanawat <vaibhav.bhanawat@delphix.com> <88050553+vaibhav-delphix@users.noreply.github.com>
 Violet Purcell <vimproved@inventati.org> <66446404+vimproved@users.noreply.github.com>
 Vipin Kumar Verma <vipin.verma@hpe.com> <75025470+vermavipinkumar@users.noreply.github.com>
 Wolfgang Bumiller <w.bumiller@proxmox.com> <Blub@users.noreply.github.com>
--- a/sys/contrib/openzfs/AUTHORS
+++ b/sys/contrib/openzfs/AUTHORS
@ -88,9 +88,11 @@ CONTRIBUTORS:
    Bassu <bassu@phi9.com>
    Ben Allen <bsallen@alcf.anl.gov>
    Ben Cordero <bencord0@condi.me>
+    Benda Xu <orv@debian.org>
    Benedikt Neuffer <github@itfriend.de>
    Benjamin Albrecht <git@albrecht.io>
    Benjamin Gentil <benjgentil.pro@gmail.com>
+    Benjamin Sherman <benjamin@holyarmy.org>
    Ben McGough <bmcgough@fredhutch.org>
    Ben Rubson <ben.rubson@gmail.com>
    Ben Wolsieffer <benwolsieffer@gmail.com>
@ -111,6 +113,7 @@ CONTRIBUTORS:
    bzzz77 <bzzz.tomas@gmail.com>
    cable2999 <cable2999@users.noreply.github.com>
    Caleb James DeLisle <calebdelisle@lavabit.com>
+    Cameron Harr <harr1@llnl.gov>
    Cao Xuewen <cao.xuewen@zte.com.cn>
    Carlo Landmeter <clandmeter@gmail.com>
    Carlos Alberto Lopez Perez <clopez@igalia.com>
@ -120,12 +123,15 @@ CONTRIBUTORS:
    Chen Can <chen.can2@zte.com.cn>
    Chengfei Zhu <chengfeix.zhu@intel.com>
    Chen Haiquan <oc@yunify.com>
+    ChenHao Lu <18302010006@fudan.edu.cn>
    Chip Parker <aparker@enthought.com>
    Chris Burroughs <chris.burroughs@gmail.com>
+    Chris Davidson <christopher.davidson@gmail.com>
    Chris Dunlap <cdunlap@llnl.gov>
    Chris Dunlop <chris@onthe.net.au>
    Chris Lindee <chris.lindee+github@gmail.com>
    Chris McDonough <chrism@plope.com>
+    Chris Peredun <chris.peredun@ixsystems.com>
    Chris Siden <chris.siden@delphix.com>
    Chris Siebenmann <cks.github@cs.toronto.edu>
    Christer Ekholm <che@chrekh.se>
@ -144,6 +150,7 @@ CONTRIBUTORS:
    Clint Armstrong <clint@clintarmstrong.net>
    Coleman Kane <ckane@colemankane.org>
    Colin Ian King <colin.king@canonical.com>
+    Colin Percival <cperciva@tarsnap.com>
    Colm Buckley <colm@tuatha.org>
    Crag Wang <crag0715@gmail.com>
    Craig Loomis <cloomis@astro.princeton.edu>
@ -156,6 +163,7 @@ CONTRIBUTORS:
    Damiano Albani <damiano.albani@gmail.com>
    Damian Szuberski <szuberskidamian@gmail.com>
    Damian Wojsław <damian@wojslaw.pl>
+    Daniel Berlin <dberlin@dberlin.org>
    Daniel Hiepler <d-git@coderdu.de>
    Daniel Hoffman <dj.hoffman@delphix.com>
    Daniel Kobras <d.kobras@science-computing.de>
@ -176,8 +184,10 @@ CONTRIBUTORS:
    David Quigley <david.quigley@intel.com>
    Debabrata Banerjee <dbanerje@akamai.com>
    D. Ebdrup <debdrup@freebsd.org>
+    Dennis R. Friedrichsen <dennis.r.friedrichsen@gmail.com>
    Denys Rtveliashvili <denys@rtveliashvili.name>
    Derek Dai <daiderek@gmail.com>
+    Dex Wood <slash2314@gmail.com>
    DHE <git@dehacked.net>
    Didier Roche <didrocks@ubuntu.com>
    Dimitri John Ledkov <xnox@ubuntu.com>
@ -235,9 +245,11 @@ CONTRIBUTORS:
    Gionatan Danti <g.danti@assyoma.it>
    Giuseppe Di Natale <guss80@gmail.com>
    Glenn Washburn <development@efficientek.com>
+    gofaster <felix.gofaster@gmail.com>
    Gordan Bobic <gordan@redsleeve.org>
    Gordon Bergling <gbergling@googlemail.com>
    Gordon Ross <gwr@nexenta.com>
+    Gordon Tetlow <gordon@freebsd.org>
    Graham Christensen <graham@grahamc.com>
    Graham Perrin <grahamperrin@gmail.com>
    Gregor Kopka <gregor@kopka.net>
@ -265,6 +277,7 @@ CONTRIBUTORS:
    Igor Kozhukhov <ikozhukhov@gmail.com>
    Igor Lvovsky <ilvovsky@gmail.com>
    ilbsmart <wgqimut@gmail.com>
+    Ilkka Sovanto <github@ilkka.kapsi.fi>
    illiliti <illiliti@protonmail.com>
    ilovezfs <ilovezfs@icloud.com>
    InsanePrawn <Insane.Prawny@gmail.com>
@ -280,9 +293,11 @@ CONTRIBUTORS:
    Jan Engelhardt <jengelh@inai.de>
    Jan Kryl <jan.kryl@nexenta.com>
    Jan Sanislo <oystr@cs.washington.edu>
+    Jaron Kent-Dobias <jaron@kent-dobias.com>
    Jason Cohen <jwittlincohen@gmail.com>
    Jason Harmening <jason.harmening@gmail.com>
    Jason King <jason.brian.king@gmail.com>
+    Jason Lee <jasonlee@lanl.gov>
    Jason Zaman <jasonzaman@gmail.com>
    Javen Wu <wu.javen@gmail.com>
    Jean-Baptiste Lallement <jean-baptiste@ubuntu.com>
@ -313,6 +328,7 @@ CONTRIBUTORS:
    Jonathon Fernyhough <jonathon@m2x.dev>
    Jorgen Lundman <lundman@lundman.net>
    Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
+    Jose Luis Duran <jlduran@gmail.com>
    Josh Soref <jsoref@users.noreply.github.com>
    Joshua M. Clulow <josh@sysmgr.org>
    José Luis Salvador Rufo <salvador.joseluis@gmail.com>
@ -336,8 +352,10 @@ CONTRIBUTORS:
    Kash Pande <kash@tripleback.net>
    Kay Pedersen <christianpe96@gmail.com>
    Keith M Wesolowski <wesolows@foobazco.org>
+    Kent Ross <k@mad.cash>
    KernelOfTruth <kerneloftruth@gmail.com>
    Kevin Bowling <kevin.bowling@kev009.com>
+    Kevin Greene <kevin.greene@delphix.com>
    Kevin Jin <lostking2008@hotmail.com>
    Kevin P. Fleming <kevin@km6g.us>
    Kevin Tanguy <kevin.tanguy@ovh.net>
@ -389,6 +407,7 @@ CONTRIBUTORS:
    Mark Shellenbaum <Mark.Shellenbaum@Oracle.COM>
    marku89 <mar42@kola.li>
    Mark Wright <markwright@internode.on.net>
+    Mart Frauenlob <allkind@fastest.cc>
    Martin Matuska <mm@FreeBSD.org>
    Martin Rüegg <martin.rueegg@metaworx.ch>
    Massimo Maggi <me@massimo-maggi.eu>
@ -405,6 +424,7 @@ CONTRIBUTORS:
    Matus Kral <matuskral@me.com>
    Mauricio Faria de Oliveira <mfo@canonical.com>
    Max Grossman <max.grossman@delphix.com>
+    Maxim Filimonov <che@bein.link>
    Maximilian Mehnert <maximilian.mehnert@gmx.de>
    Max Zettlmeißl <max@zettlmeissl.de>
    Md Islam <mdnahian@outlook.com>
@ -417,6 +437,7 @@ CONTRIBUTORS:
    Michael Niewöhner <foss@mniewoehner.de>
    Michael Zhivich <mzhivich@akamai.com>
    Michal Vasilek <michal@vasilek.cz>
+    MigeljanImeri <ImeriMigel@gmail.com>
    Mike Gerdts <mike.gerdts@joyent.com>
    Mike Harsch <mike@harschsystems.com>
    Mike Leddy <mike.leddy@gmail.com>
@ -448,6 +469,7 @@ CONTRIBUTORS:
    Olaf Faaland <faaland1@llnl.gov>
    Oleg Drokin <green@linuxhacker.ru>
    Oleg Stepura <oleg@stepura.com>
+    Olivier Certner <olce.freebsd@certner.fr>
    Olivier Mazouffre <olivier.mazouffre@ims-bordeaux.fr>
    omni <omni+vagant@hack.org>
    Orivej Desh <orivej@gmx.fr>
@ -479,6 +501,7 @@ CONTRIBUTORS:
    Prasad Joshi <prasadjoshi124@gmail.com>
    privb0x23 <privb0x23@users.noreply.github.com>
    P.SCH <p88@yahoo.com>
+    Quartz <yyhran@163.com>
    Quentin Zdanis <zdanisq@gmail.com>
    Rafael Kitover <rkitover@gmail.com>
    RageLtMan <sempervictus@users.noreply.github.com>
@ -491,11 +514,15 @@ CONTRIBUTORS:
    Riccardo Schirone <rschirone91@gmail.com>
    Richard Allen <belperite@gmail.com>
    Richard Elling <Richard.Elling@RichardElling.com>
+    Richard Kojedzinszky <richard@kojedz.in>
    Richard Laager <rlaager@wiktel.com>
    Richard Lowe <richlowe@richlowe.net>
    Richard Sharpe <rsharpe@samba.org>
    Richard Yao <ryao@gentoo.org>
    Rich Ercolani <rincebrain@gmail.com>
+    Rick Macklem <rmacklem@uoguelph.ca>
+    rilysh <nightquick@proton.me>
+    Robert Evans <evansr@google.com>
    Robert Novak <sailnfool@gmail.com>
    Roberto Ricci <ricci@disroot.org>
    Rob Norris <robn@despairlabs.com>
@ -509,7 +536,9 @@ CONTRIBUTORS:
    Ryan Lahfa <masterancpp@gmail.com>
    Ryan Libby <rlibby@FreeBSD.org>
    Ryan Moeller <freqlabs@FreeBSD.org>
+    Sam Atkinson <samatk@amazon.com>
    Sam Hathaway <github.com@munkynet.org>
+    Sam James <sam@gentoo.org>
    Sam Lunt <samuel.j.lunt@gmail.com>
    Samuel VERSCHELDE <stormi-github@ylix.fr>
    Samuel Wycliffe <samuelwycliffe@gmail.com>
@ -530,6 +559,8 @@ CONTRIBUTORS:
    Shaan Nobee <sniper111@gmail.com>
    Shampavman <sham.pavman@nexenta.com>
    Shaun Tancheff <shaun@aeonazure.com>
+    Shawn Bayern <sbayern@law.fsu.edu>
+    Shengqi Chen <harry-chen@outlook.com>
    Shen Yan <shenyanxxxy@qq.com>
    Simon Guest <simon.guest@tesujimath.org>
    Simon Klinkert <simon.klinkert@gmail.com>
@ -537,6 +568,7 @@ CONTRIBUTORS:
    Spencer Kinny <spencerkinny1995@gmail.com>
    Srikanth N S <srikanth.nagasubbaraoseetharaman@hpe.com>
    Stanislav Seletskiy <s.seletskiy@gmail.com>
+    Stefan Lendl <s.lendl@proxmox.com>
    Steffen Müthing <steffen.muething@iwr.uni-heidelberg.de>
    Stephen Blinick <stephen.blinick@delphix.com>
    sterlingjensen <sterlingjensen@users.noreply.github.com>
@ -557,6 +589,7 @@ CONTRIBUTORS:
    Teodor Spæren <teodor_spaeren@riseup.net>
    TerraTech <TerraTech@users.noreply.github.com>
    Thijs Cramer <thijs.cramer@gmail.com>
+    Thomas Bertschinger <bertschinger@lanl.gov>
    Thomas Geppert <geppi@digitx.de>
    Thomas Lamprecht <guggentom@hotmail.de>
    Till Maas <opensource@till.name>
@ -586,6 +619,7 @@ CONTRIBUTORS:
    Turbo Fredriksson <turbo@bayour.com>
    Tyler J. Stachecki <stachecki.tyler@gmail.com>
    Umer Saleem <usaleem@ixsystems.com>
+    Vaibhav Bhanawat <vaibhav.bhanawat@delphix.com>
    Valmiky Arquissandas <kayvlim@gmail.com>
    Val Packett <val@packett.cool>
    Vince van Oosten <techhazard@codeforyouand.me>
@ -614,6 +648,7 @@ CONTRIBUTORS:
    yuina822 <ayuichi@club.kyutech.ac.jp>
    YunQiang Su <syq@debian.org>
    Yuri Pankov <yuri.pankov@gmail.com>
+    Yuxin Wang <yuxinwang9999@gmail.com>
    Yuxuan Shui <yshuiv7@gmail.com>
    Zachary Bedell <zac@thebedells.org>
    Zach Dykstra <dykstra.zachary@gmail.com>
--- a/sys/contrib/openzfs/cmd/arc_summary
+++ b/sys/contrib/openzfs/cmd/arc_summary
@ -793,18 +793,27 @@ def section_dmu(kstats_dict):

    zfetch_stats = isolate_section('zfetchstats', kstats_dict)

-    zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses'])
+    zfetch_access_total = int(zfetch_stats['hits']) +\
+        int(zfetch_stats['future']) + int(zfetch_stats['stride']) +\
+        int(zfetch_stats['past']) + int(zfetch_stats['misses'])

    prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total))
    prt_i2('Stream hits:',
           f_perc(zfetch_stats['hits'], zfetch_access_total),
           f_hits(zfetch_stats['hits']))
+    future = int(zfetch_stats['future']) + int(zfetch_stats['stride'])
+    prt_i2('Hits ahead of stream:', f_perc(future, zfetch_access_total),
+           f_hits(future))
+    prt_i2('Hits behind stream:',
+           f_perc(zfetch_stats['past'], zfetch_access_total),
+           f_hits(zfetch_stats['past']))
    prt_i2('Stream misses:',
           f_perc(zfetch_stats['misses'], zfetch_access_total),
           f_hits(zfetch_stats['misses']))
    prt_i2('Streams limit reached:',
           f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']),
           f_hits(zfetch_stats['max_streams']))
+    prt_i1('Stream strides:', f_hits(zfetch_stats['stride']))
    prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued']))
    print()

--- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
@ -309,7 +309,8 @@ get_usage(zfs_help_t idx)
 		    "[filesystem|volume|snapshot] ...\n"));
 	case HELP_MOUNT:
 		return (gettext("\tmount\n"
-		    "\tmount [-flvO] [-o opts] <-a | filesystem>\n"));
+		    "\tmount [-flvO] [-o opts] <-a|-R filesystem|"
+		    "filesystem>\n"));
 	case HELP_PROMOTE:
 		return (gettext("\tpromote <clone-filesystem>\n"));
 	case HELP_RECEIVE:
@ -6754,6 +6755,8 @@ zfs_do_holds(int argc, char **argv)
 #define	MOUNT_TIME 1		/* seconds */

 typedef struct get_all_state {
+	char		**ga_datasets;
+	int		ga_count;
 	boolean_t	ga_verbose;
 	get_all_cb_t	*ga_cbp;
 } get_all_state_t;
@ -6800,19 +6803,35 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
 	return (0);
 }

-static void
-get_all_datasets(get_all_cb_t *cbp, boolean_t verbose)
+static int
+get_recursive_datasets(zfs_handle_t *zhp, void *data)
 {
-	get_all_state_t state = {
-	    .ga_verbose = verbose,
-	    .ga_cbp = cbp
-	};
+	get_all_state_t *state = data;
+	int len = strlen(zfs_get_name(zhp));
+	for (int i = 0; i < state->ga_count; ++i) {
+		if (strcmp(state->ga_datasets[i], zfs_get_name(zhp)) == 0)
+			return (get_one_dataset(zhp, data));
+		else if ((strncmp(state->ga_datasets[i], zfs_get_name(zhp),
+		    len) == 0) && state->ga_datasets[i][len] == '/') {
+			(void) zfs_iter_filesystems_v2(zhp, 0,
+			    get_recursive_datasets, data);
+		}
+	}
+	zfs_close(zhp);
+	return (0);
+}

-	if (verbose)
+static void
+get_all_datasets(get_all_state_t *state)
+{
+	if (state->ga_verbose)
 		set_progress_header(gettext("Reading ZFS config"));
-	(void) zfs_iter_root(g_zfs, get_one_dataset, &state);
+	if (state->ga_datasets == NULL)
+		(void) zfs_iter_root(g_zfs, get_one_dataset, state);
+	else
+		(void) zfs_iter_root(g_zfs, get_recursive_datasets, state);

-	if (verbose)
+	if (state->ga_verbose)
 		finish_progress(gettext("done."));
 }

@ -7158,18 +7177,22 @@ static int
 share_mount(int op, int argc, char **argv)
 {
 	int do_all = 0;
+	int recursive = 0;
 	boolean_t verbose = B_FALSE;
 	int c, ret = 0;
 	char *options = NULL;
 	int flags = 0;

 	/* check options */
-	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:Of" : "al"))
+	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":aRlvo:Of" : "al"))
 	    != -1) {
 		switch (c) {
 		case 'a':
 			do_all = 1;
 			break;
+		case 'R':
+			recursive = 1;
+			break;
 		case 'v':
 			verbose = B_TRUE;
 			break;
@ -7211,7 +7234,7 @@ share_mount(int op, int argc, char **argv)
 	argv += optind;

 	/* check number of arguments */
-	if (do_all) {
+	if (do_all || recursive) {
 		enum sa_protocol protocol = SA_NO_PROTOCOL;

 		if (op == OP_SHARE && argc > 0) {
@ -7220,14 +7243,38 @@ share_mount(int op, int argc, char **argv)
 			argv++;
 		}

-		if (argc != 0) {
+		if (argc != 0 && do_all) {
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}

+		if (argc == 0 && recursive) {
+			(void) fprintf(stderr,
+			    gettext("no dataset provided\n"));
+			usage(B_FALSE);
+		}
+
 		start_progress_timer();
 		get_all_cb_t cb = { 0 };
-		get_all_datasets(&cb, verbose);
+		get_all_state_t state = { 0 };
+		if (argc == 0) {
+			state.ga_datasets = NULL;
+			state.ga_count = -1;
+		} else {
+			zfs_handle_t *zhp;
+			for (int i = 0; i < argc; i++) {
+				zhp = zfs_open(g_zfs, argv[i],
+				    ZFS_TYPE_FILESYSTEM);
+				if (zhp == NULL)
+					usage(B_FALSE);
+				zfs_close(zhp);
+			}
+			state.ga_datasets = argv;
+			state.ga_count = argc;
+		}
+		state.ga_verbose = verbose;
+		state.ga_cbp = &cb;
+		get_all_datasets(&state);

 		if (cb.cb_used == 0) {
 			free(options);
--- a/sys/contrib/openzfs/cmd/zinject/zinject.c
+++ b/sys/contrib/openzfs/cmd/zinject/zinject.c
@ -22,6 +22,7 @@
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2024, Klara Inc.
 */

 /*
@ -208,6 +209,38 @@ type_to_name(uint64_t type)
 	}
 }

+struct errstr {
+	int		err;
+	const char	*str;
+};
+static const struct errstr errstrtable[] = {
+	{ EIO,		"io" },
+	{ ECKSUM,	"checksum" },
+	{ EINVAL,	"decompress" },
+	{ EACCES,	"decrypt" },
+	{ ENXIO,	"nxio" },
+	{ ECHILD,	"dtl" },
+	{ EILSEQ,	"corrupt" },
+	{ ENOSYS,	"noop" },
+	{ 0, NULL },
+};
+
+static int
+str_to_err(const char *str)
+{
+	for (int i = 0; errstrtable[i].str != NULL; i++)
+		if (strcasecmp(errstrtable[i].str, str) == 0)
+			return (errstrtable[i].err);
+	return (-1);
+}
+static const char *
+err_to_str(int err)
+{
+	for (int i = 0; errstrtable[i].str != NULL; i++)
+		if (errstrtable[i].err == err)
+			return (errstrtable[i].str);
+	return ("[unknown]");
+}

 /*
 * Print usage message.
@ -233,12 +266,12 @@ usage(void)
 	    "\t\tspa_vdev_exit() will trigger a panic.\n"
 	    "\n"
 	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
-	    "\t\t[-T <read|write|free|claim|all>] [-f frequency] pool\n\n"
+	    "\t\t[-T <read|write|free|claim|flush|all>] [-f frequency] pool\n\n"
 	    "\t\tInject a fault into a particular device or the device's\n"
 	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n "
 	    "\t\t'pad1', or 'pad2'.\n"
-	    "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl', or\n"
-	    "\t\t'corrupt' (bit flip).\n"
+	    "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl',\n"
+	    "\t\t'corrupt' (bit flip), or 'noop' (successfully do nothing).\n"
 	    "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"
 	    "\t\tdevice error injection to a percentage of the IOs.\n"
 	    "\n"
@ -392,6 +425,10 @@ static int
 print_device_handler(int id, const char *pool, zinject_record_t *record,
    void *data)
 {
+	static const char *iotypestr[] = {
+	    "null", "read", "write", "free", "claim", "flush", "trim", "all",
+	};
+
 	int *count = data;

 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
@ -401,14 +438,21 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
 		return (0);

 	if (*count == 0) {
-		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "GUID");
-		(void) printf("---  ---------------  ----------------\n");
+		(void) printf("%3s  %-15s  %-16s  %-5s  %-10s  %-9s\n",
+		    "ID", "POOL", "GUID", "TYPE", "ERROR", "FREQ");
+		(void) printf(
+		    "---  ---------------  ----------------  "
+		    "-----  ----------  ---------\n");
 	}

 	*count += 1;

-	(void) printf("%3d  %-15s  %llx\n", id, pool,
-	    (u_longlong_t)record->zi_guid);
+	double freq = record->zi_freq == 0 ? 100.0f :
+	    (((double)record->zi_freq) / ZI_PERCENTAGE_MAX) * 100.0f;
+
+	(void) printf("%3d  %-15s  %llx  %-5s  %-10s  %8.4f%%\n", id, pool,
+	    (u_longlong_t)record->zi_guid, iotypestr[record->zi_iotype],
+	    err_to_str(record->zi_error), freq);

 	return (0);
 }
@ -842,24 +886,12 @@ main(int argc, char **argv)
 			}
 			break;
 		case 'e':
-			if (strcasecmp(optarg, "io") == 0) {
-				error = EIO;
-			} else if (strcasecmp(optarg, "checksum") == 0) {
-				error = ECKSUM;
-			} else if (strcasecmp(optarg, "decompress") == 0) {
-				error = EINVAL;
-			} else if (strcasecmp(optarg, "decrypt") == 0) {
-				error = EACCES;
-			} else if (strcasecmp(optarg, "nxio") == 0) {
-				error = ENXIO;
-			} else if (strcasecmp(optarg, "dtl") == 0) {
-				error = ECHILD;
-			} else if (strcasecmp(optarg, "corrupt") == 0) {
-				error = EILSEQ;
-			} else {
+			error = str_to_err(optarg);
+			if (error < 0) {
 				(void) fprintf(stderr, "invalid error type "
-				    "'%s': must be 'io', 'checksum' or "
-				    "'nxio'\n", optarg);
+				    "'%s': must be one of: io decompress "
+				    "decrypt nxio dtl corrupt noop\n",
+				    optarg);
 				usage();
 				libzfs_fini(g_zfs);
 				return (1);
@ -947,12 +979,14 @@ main(int argc, char **argv)
 				io_type = ZIO_TYPE_FREE;
 			} else if (strcasecmp(optarg, "claim") == 0) {
 				io_type = ZIO_TYPE_CLAIM;
+			} else if (strcasecmp(optarg, "flush") == 0) {
+				io_type = ZIO_TYPE_FLUSH;
 			} else if (strcasecmp(optarg, "all") == 0) {
 				io_type = ZIO_TYPES;
 			} else {
 				(void) fprintf(stderr, "invalid I/O type "
 				    "'%s': must be 'read', 'write', 'free', "
-				    "'claim' or 'all'\n", optarg);
+				    "'claim', 'flush' or 'all'\n", optarg);
 				usage();
 				libzfs_fini(g_zfs);
 				return (1);
--- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
@ -2289,7 +2289,6 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose)
 		    !vs->vs_scan_removing) {
 			char zbuf[1024];
 			char tbuf[256];
-			struct tm zaction_ts;

 			time_t t = vs->vs_initialize_action_time;
 			int initialize_pct = 100;
@ -2299,8 +2298,8 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose)
 				    100 / (vs->vs_initialize_bytes_est + 1));
 			}

-			(void) localtime_r(&t, &zaction_ts);
-			(void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+			(void) ctime_r(&t, tbuf);
+			tbuf[24] = 0;

 			switch (vs->vs_initialize_state) {
 			case VDEV_INITIALIZE_SUSPENDED:
@ -2340,7 +2339,6 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose)
 		    !vs->vs_scan_removing) {
 			char zbuf[1024];
 			char tbuf[256];
-			struct tm zaction_ts;

 			time_t t = vs->vs_trim_action_time;
 			int trim_pct = 100;
@ -2349,8 +2347,8 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose)
 				    100 / (vs->vs_trim_bytes_est + 1));
 			}

-			(void) localtime_r(&t, &zaction_ts);
-			(void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+			(void) ctime_r(&t, tbuf);
+			tbuf[24] = 0;

 			switch (vs->vs_trim_state) {
 			case VDEV_TRIM_SUSPENDED:
@ -10793,11 +10791,10 @@ zpool_do_get(int argc, char **argv)
 		}
 	} else {
 		/*
-		 * The first arg isn't a pool name,
+		 * The first arg isn't the name of a valid pool.
 		 */
-		fprintf(stderr, gettext("missing pool name.\n"));
-		fprintf(stderr, "\n");
-		usage(B_FALSE);
+		fprintf(stderr, gettext("Cannot get properties of %s: "
+		    "no such pool available.\n"), argv[0]);
 		return (1);
 	}

--- a/sys/contrib/openzfs/config/Substfiles.am
+++ b/sys/contrib/openzfs/config/Substfiles.am
@ -18,6 +18,7 @@ subst_sed_cmd = \
 	-e 's|@ASAN_ENABLED[@]|$(ASAN_ENABLED)|g' \
 	-e 's|@DEFAULT_INIT_NFS_SERVER[@]|$(DEFAULT_INIT_NFS_SERVER)|g' \
 	-e 's|@DEFAULT_INIT_SHELL[@]|$(DEFAULT_INIT_SHELL)|g' \
+	-e 's|@IS_SYSV_RC[@]|$(IS_SYSV_RC)|g' \
 	-e 's|@LIBFETCH_DYNAMIC[@]|$(LIBFETCH_DYNAMIC)|g' \
 	-e 's|@LIBFETCH_SONAME[@]|$(LIBFETCH_SONAME)|g' \
 	-e 's|@PYTHON[@]|$(PYTHON)|g' \
@ -43,4 +44,4 @@ SUBSTFILES =
 CLEANFILES += $(SUBSTFILES)
 dist_noinst_DATA += $(SUBSTFILES:=.in)

-$(call SUBST,%,)
+$(SUBSTFILES): $(call SUBST,%,)
--- a/sys/contrib/openzfs/config/kernel-blk-queue.m4
+++ b/sys/contrib/openzfs/config/kernel-blk-queue.m4
@ -377,6 +377,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [
 		(void) blk_mq_alloc_tag_set(&tag_set);
 		return BLK_STS_OK;
 	], [])
+	ZFS_LINUX_TEST_SRC([blk_mq_rq_hctx], [
+		#include <linux/blk-mq.h>
+		#include <linux/blkdev.h>
+	], [
+		struct request rq = {0};
+		struct blk_mq_hw_ctx *hctx = NULL;
+		rq.mq_hctx = hctx;
+	], [])
 ])

 AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
@ -384,6 +392,13 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
 	ZFS_LINUX_TEST_RESULT([blk_mq], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available])
+		AC_MSG_CHECKING([whether block multiqueue hardware context is cached in struct request])
+		ZFS_LINUX_TEST_RESULT([blk_mq_rq_hctx], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_BLK_MQ_RQ_HCTX, 1, [block multiqueue hardware context is cached in struct request])
+		], [
+			AC_MSG_RESULT(no)
+		])
 	], [
 		AC_MSG_RESULT(no)
 	])
--- a/sys/contrib/openzfs/config/kernel-blkdev.m4
+++ b/sys/contrib/openzfs/config/kernel-blkdev.m4
@ -54,6 +54,26 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH], [
 	])
 ])

+dnl #
+dnl # 6.9.x API change
+dnl # bdev_file_open_by_path() replaced bdev_open_by_path(),
+dnl # and returns struct file*
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH], [
+	ZFS_LINUX_TEST_SRC([bdev_file_open_by_path], [
+		#include <linux/fs.h>
+		#include <linux/blkdev.h>
+	], [
+		struct file *file __attribute__ ((unused)) = NULL;
+		const char *path = "path";
+		fmode_t mode = 0;
+		void *holder = NULL;
+		struct blk_holder_ops h;
+
+		file = bdev_file_open_by_path(path, mode, holder, &h);
+	])
+])
+
 AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [
 	AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args])
 	ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [
@ -73,7 +93,16 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [
 					[bdev_open_by_path() exists])
 				AC_MSG_RESULT(yes)
 			], [
-				ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()])
+				AC_MSG_RESULT(no)
+				AC_MSG_CHECKING([whether bdev_file_open_by_path() exists])
+				ZFS_LINUX_TEST_RESULT([bdev_file_open_by_path], [
+					AC_DEFINE(HAVE_BDEV_FILE_OPEN_BY_PATH, 1,
+						[bdev_file_open_by_path() exists])
+					AC_MSG_RESULT(yes)
+				], [
+					AC_MSG_RESULT(no)
+					ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()])
+				])
 			])
 		])
 	])
@ -149,10 +178,19 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE], [
 	])
 ])

+dnl #
+dnl # 6.9.x API change
+dnl #
+dnl # bdev_release() now private, but because bdev_file_open_by_path() returns
+dnl # struct file*, we can just use fput(). So the blkdev_put test no longer
+dnl # fails if not found.
+dnl #
+
 AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [
 	AC_MSG_CHECKING([whether blkdev_put() exists])
 	ZFS_LINUX_TEST_RESULT([blkdev_put], [
 		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_PUT, 1, [blkdev_put() exists])
 	], [
 		AC_MSG_RESULT(no)
 		AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2])
@ -168,7 +206,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [
 				AC_DEFINE(HAVE_BDEV_RELEASE, 1,
 					[bdev_release() exists])
 			], [
-				ZFS_LINUX_TEST_ERROR([blkdev_put()])
+				AC_MSG_RESULT(no)
 			])
 		])
 	])
@ -523,12 +561,29 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [
 ])

 dnl #
-dnl # 5.19 API: blkdev_issue_secure_erase()
-dnl # 4.7  API: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE)
-dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE)
+dnl # TRIM support: discard and secure erase. We make use of asynchronous
+dnl #               functions when available.
 dnl #
-AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [
-	ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [
+dnl # 3.10:
+dnl #   sync discard:  blkdev_issue_discard(..., 0)
+dnl #   sync erase:    blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE)
+dnl #   async discard: [not available]
+dnl #   async erase:   [not available]
+dnl #
+dnl # 4.7:
+dnl #   sync discard:  blkdev_issue_discard(..., 0)
+dnl #   sync erase:    blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE)
+dnl #   async discard: __blkdev_issue_discard(..., 0)
+dnl #   async erase:   __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE)
+dnl #
+dnl # 5.19:
+dnl #   sync discard:  blkdev_issue_discard(...)
+dnl #   sync erase:    blkdev_issue_secure_erase(...)
+dnl #   async discard: __blkdev_issue_discard(...)
+dnl #   async erase:   [not available]
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD], [
+	ZFS_LINUX_TEST_SRC([blkdev_issue_discard_noflags], [
 		#include <linux/blkdev.h>
 	],[
 		struct block_device *bdev = NULL;
@ -536,10 +591,33 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [
 		sector_t nr_sects = 0;
 		int error __attribute__ ((unused));

-		error = blkdev_issue_secure_erase(bdev,
+		error = blkdev_issue_discard(bdev,
 		    sector, nr_sects, GFP_KERNEL);
 	])
+	ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [
+		#include <linux/blkdev.h>
+	],[
+		struct block_device *bdev = NULL;
+		sector_t sector = 0;
+		sector_t nr_sects = 0;
+		unsigned long flags = 0;
+		int error __attribute__ ((unused));

+		error = blkdev_issue_discard(bdev,
+		    sector, nr_sects, GFP_KERNEL, flags);
+	])
+	ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_noflags], [
+		#include <linux/blkdev.h>
+	],[
+		struct block_device *bdev = NULL;
+		sector_t sector = 0;
+		sector_t nr_sects = 0;
+		struct bio *biop = NULL;
+		int error __attribute__ ((unused));
+
+		error = __blkdev_issue_discard(bdev,
+		    sector, nr_sects, GFP_KERNEL, &biop);
+	])
 	ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_flags], [
 		#include <linux/blkdev.h>
 	],[
@ -553,22 +631,52 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [
 		error = __blkdev_issue_discard(bdev,
 		    sector, nr_sects, GFP_KERNEL, flags, &biop);
 	])
-
-	ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [
+	ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [
 		#include <linux/blkdev.h>
 	],[
 		struct block_device *bdev = NULL;
 		sector_t sector = 0;
 		sector_t nr_sects = 0;
-		unsigned long flags = 0;
 		int error __attribute__ ((unused));

-		error = blkdev_issue_discard(bdev,
-		    sector, nr_sects, GFP_KERNEL, flags);
+		error = blkdev_issue_secure_erase(bdev,
+		    sector, nr_sects, GFP_KERNEL);
 	])
 ])

-AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD], [
+	AC_MSG_CHECKING([whether blkdev_issue_discard() is available])
+	ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_noflags], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS, 1,
+		    [blkdev_issue_discard() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+	AC_MSG_CHECKING([whether blkdev_issue_discard(flags) is available])
+	ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS, 1,
+		    [blkdev_issue_discard(flags) is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+	AC_MSG_CHECKING([whether __blkdev_issue_discard() is available])
+	ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_noflags], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS, 1,
+		    [__blkdev_issue_discard() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+	AC_MSG_CHECKING([whether __blkdev_issue_discard(flags) is available])
+	ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS, 1,
+		    [__blkdev_issue_discard(flags) is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
 	AC_MSG_CHECKING([whether blkdev_issue_secure_erase() is available])
 	ZFS_LINUX_TEST_RESULT([blkdev_issue_secure_erase], [
 		AC_MSG_RESULT(yes)
@ -576,24 +684,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [
 		    [blkdev_issue_secure_erase() is available])
 	],[
 		AC_MSG_RESULT(no)
-
-		AC_MSG_CHECKING([whether __blkdev_issue_discard() is available])
-		ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [
-			AC_MSG_RESULT(yes)
-			AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC, 1,
-			    [__blkdev_issue_discard() is available])
-		],[
-			AC_MSG_RESULT(no)
-
-			AC_MSG_CHECKING([whether blkdev_issue_discard() is available])
-			ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [
-				AC_MSG_RESULT(yes)
-				AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1,
-					[blkdev_issue_discard() is available])
-			],[
-				ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()])
-			])
-		])
 	])
 ])

@ -645,6 +735,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH
 	ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH
+	ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH
 	ZFS_AC_KERNEL_SRC_BLKDEV_PUT
 	ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE
@ -657,7 +748,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME
-	ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE
+	ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV
 	ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE
@ -678,7 +769,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
 	ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE
 	ZFS_AC_KERNEL_BLKDEV_BDEVNAME
 	ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS
-	ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE
+	ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD
 	ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV
 	ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE
--- a/sys/contrib/openzfs/config/kernel-make-request-fn.m4
+++ b/sys/contrib/openzfs/config/kernel-make-request-fn.m4
@ -50,6 +50,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [
 		disk = blk_alloc_disk(NUMA_NO_NODE);
 	])

+	ZFS_LINUX_TEST_SRC([blk_alloc_disk_2arg], [
+		#include <linux/blkdev.h>
+	],[
+		struct queue_limits *lim = NULL;
+		struct gendisk *disk  __attribute__ ((unused));
+		disk = blk_alloc_disk(lim, NUMA_NO_NODE);
+	])
+
 	ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [
 		#include <linux/blkdev.h>
 	],[
@ -96,6 +104,31 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
 		], [
 			AC_MSG_RESULT(no)
 		])
+
+		dnl #
+		dnl # Linux 6.9 API Change:
+		dnl # blk_alloc_queue() takes a nullable queue_limits arg.
+		dnl #
+		AC_MSG_CHECKING([whether blk_alloc_disk() exists and takes 2 args])
+		ZFS_LINUX_TEST_RESULT([blk_alloc_disk_2arg], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args])
+
+			dnl #
+			dnl # 5.20 API change,
+			dnl # Removed blk_cleanup_disk(), put_disk() should be used.
+			dnl #
+			AC_MSG_CHECKING([whether blk_cleanup_disk() exists])
+			ZFS_LINUX_TEST_RESULT([blk_cleanup_disk], [
+				AC_MSG_RESULT(yes)
+				AC_DEFINE([HAVE_BLK_CLEANUP_DISK], 1,
+				    [blk_cleanup_disk() exists])
+			], [
+				AC_MSG_RESULT(no)
+			])
+		], [
+			AC_MSG_RESULT(no)
+		])
 	],[
 		AC_MSG_RESULT(no)

--- a/sys/contrib/openzfs/config/zfs-build.m4
+++ b/sys/contrib/openzfs/config/zfs-build.m4
@ -578,13 +578,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [

 	AC_MSG_CHECKING([default shell])
 	case "$VENDOR" in
-		gentoo)     DEFAULT_INIT_SHELL="/sbin/openrc-run";;
-		alpine)     DEFAULT_INIT_SHELL="/sbin/openrc-run";;
-		*)          DEFAULT_INIT_SHELL="/bin/sh"         ;;
+		gentoo|alpine)	DEFAULT_INIT_SHELL=/sbin/openrc-run
+				IS_SYSV_RC=false	;;
+		*)		DEFAULT_INIT_SHELL=/bin/sh
+				IS_SYSV_RC=true		;;
 	esac

 	AC_MSG_RESULT([$DEFAULT_INIT_SHELL])
 	AC_SUBST(DEFAULT_INIT_SHELL)
+	AC_SUBST(IS_SYSV_RC)

 	AC_MSG_CHECKING([default nfs server init script])
 	AS_IF([test "$VENDOR" = "debian"],
--- a/sys/contrib/openzfs/etc/init.d/README.md
+++ b/sys/contrib/openzfs/etc/init.d/README.md
@ -7,11 +7,7 @@ DESCRIPTION

  They have been tested successfully on:

-    * Debian GNU/Linux Wheezy
-    * Debian GNU/Linux Jessie
-    * Ubuntu Trusty
-    * CentOS 6.0
-    * CentOS 6.6
+    * Debian GNU/Linux Bookworm
    * Gentoo

 SUPPORT
--- a/sys/contrib/openzfs/etc/init.d/zfs-import.in
+++ b/sys/contrib/openzfs/etc/init.d/zfs-import.in
@ -307,7 +307,7 @@ do_start()

 # ----------------------------------------------------

-if [ ! -e /sbin/openrc-run ]
+if @IS_SYSV_RC@
 then
 	case "$1" in
 		start)
--- a/sys/contrib/openzfs/etc/init.d/zfs-load-key.in
+++ b/sys/contrib/openzfs/etc/init.d/zfs-load-key.in
@ -104,7 +104,7 @@ do_stop()

 # ----------------------------------------------------

-if [ ! -e /sbin/openrc-run ]
+if @IS_SYSV_RC@
 then
 	case "$1" in
 		start)
--- a/sys/contrib/openzfs/etc/init.d/zfs-mount.in
+++ b/sys/contrib/openzfs/etc/init.d/zfs-mount.in
@ -114,7 +114,7 @@ do_stop()

 # ----------------------------------------------------

-if [ ! -e /sbin/openrc-run ]
+if @IS_SYSV_RC@
 then
 	case "$1" in
 		start)
--- a/sys/contrib/openzfs/etc/init.d/zfs-share.in
+++ b/sys/contrib/openzfs/etc/init.d/zfs-share.in
@ -57,7 +57,8 @@ do_stop()

 # ----------------------------------------------------

-if [ ! -e /sbin/openrc-run ]; then
+if @IS_SYSV_RC@
+then
 	case "$1" in
 		start)
 			do_start
--- a/sys/contrib/openzfs/etc/init.d/zfs-zed.in
+++ b/sys/contrib/openzfs/etc/init.d/zfs-zed.in
@ -93,7 +93,8 @@ do_reload()

 # ----------------------------------------------------

-if [ ! -e /sbin/openrc-run ]; then
+if @IS_SYSV_RC@
+then
 	case "$1" in
 		start)
 			do_start
--- a/sys/contrib/openzfs/include/os/freebsd/Makefile.am
+++ b/sys/contrib/openzfs/include/os/freebsd/Makefile.am
@ -4,8 +4,6 @@ noinst_HEADERS = \
 	\
 	%D%/spl/acl/acl_common.h \
 	\
-	%D%/spl/rpc/xdr.h \
-	\
 	%D%/spl/sys/ia32/asm_linkage.h \
 	\
 	%D%/spl/sys/acl.h \
@ -22,7 +20,6 @@ noinst_HEADERS = \
 	%D%/spl/sys/debug.h \
 	%D%/spl/sys/dirent.h \
 	%D%/spl/sys/disp.h \
-	%D%/spl/sys/dkio.h \
 	%D%/spl/sys/fcntl.h \
 	%D%/spl/sys/file.h \
 	%D%/spl/sys/freebsd_rwlock.h \
--- a/sys/contrib/openzfs/include/os/freebsd/spl/rpc/xdr.h
+++ b/sys/contrib/openzfs/include/os/freebsd/spl/rpc/xdr.h
@ -1,71 +0,0 @@
-/*
- * Sun RPC is a product of Sun Microsystems, Inc. and is provided for
- * unrestricted use provided that this legend is included on all tape
- * media and as a part of the software program in whole or part.  Users
- * may copy or modify Sun RPC without charge, but are not authorized
- * to license or distribute it to anyone else except as part of a product or
- * program developed by the user.
- *
- * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE
- * WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE.
- *
- * Sun RPC is provided with no support and without any obligation on the
- * part of Sun Microsystems, Inc. to assist in its use, correction,
- * modification or enhancement.
- *
- * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE
- * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC
- * OR ANY PART THEREOF.
- *
- * In no event will Sun Microsystems, Inc. be liable for any lost revenue
- * or profits or other special, indirect and consequential damages, even if
- * Sun has been advised of the possibility of such damages.
- *
- * Sun Microsystems, Inc.
- * 2550 Garcia Avenue
- * Mountain View, California  94043
- */
-
-#ifndef	_OPENSOLARIS_RPC_XDR_H_
-#define	_OPENSOLARIS_RPC_XDR_H_
-
-#include <rpc/types.h>
-#include_next <rpc/xdr.h>
-
-#if !defined(_KERNEL) && !defined(_STANDALONE)
-
-#include <assert.h>
-
-/*
- * Taken from sys/xdr/xdr_mem.c.
- *
- * FreeBSD's userland XDR doesn't implement control method (only the kernel),
- * but OpenSolaris nvpair still depend on it, so we have to implement it here.
- */
-static __inline bool_t
-xdrmem_control(XDR *xdrs, int request, void *info)
-{
-	xdr_bytesrec *xptr;
-
-	switch (request) {
-	case XDR_GET_BYTES_AVAIL:
-		xptr = (xdr_bytesrec *)info;
-		xptr->xc_is_last_record = TRUE;
-		xptr->xc_num_avail = xdrs->x_handy;
-		return (TRUE);
-	default:
-		assert(!"unexpected request");
-	}
-	return (FALSE);
-}
-
-#undef XDR_CONTROL
-#define	XDR_CONTROL(xdrs, req, op)					\
-	(((xdrs)->x_ops->x_control == NULL) ?				\
-	    xdrmem_control((xdrs), (req), (op)) :			\
-	    (*(xdrs)->x_ops->x_control)(xdrs, req, op))
-
-#endif	/* !_KERNEL && !_STANDALONE */
-
-#endif	/* !_OPENSOLARIS_RPC_XDR_H_ */
--- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h
+++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h
@ -56,11 +56,33 @@
 /*
 * Common DEBUG functionality.
 */
+#ifdef __FreeBSD__
+#include <linux/compiler.h>
+#endif
+
+#ifndef __printflike
+#define	__printflike(a, b)	__printf(a, b)
+#endif
+
+#ifndef __maybe_unused
+#define	__maybe_unused __attribute__((unused))
+#endif
+
+/*
+ * Without this, we see warnings from objtool during normal Linux builds when
+ * the kernel is built with CONFIG_STACK_VALIDATION=y:
+ *
+ * warning: objtool: tsd_create() falls through to next function __list_add()
+ * warning: objtool: .text: unexpected end of section
+ *
+ * Until the toolchain stops doing this, we must only define this attribute on
+ * spl_panic() when doing static analysis.
+ */
 #if defined(__COVERITY__) || defined(__clang_analyzer__)
 __attribute__((__noreturn__))
 #endif
 extern void spl_panic(const char *file, const char *func, int line,
-    const char *fmt, ...) __attribute__((__noreturn__));
+    const char *fmt, ...);
 extern void spl_dumpstack(void);

 static inline int
@ -73,8 +95,10 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 #ifndef expect
 #define	expect(expr, value) (__builtin_expect((expr), (value)))
 #endif
+#ifndef __linux__
 #define	likely(expr)   expect((expr) != 0, 1)
 #define	unlikely(expr) expect((expr) != 0, 0)
+#endif

 #define	PANIC(fmt, a...)						\
 	spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a)
@ -84,6 +108,12 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 	    spl_assert("VERIFY(" #cond ") failed\n",			\
 	    __FILE__, __FUNCTION__, __LINE__))

+#define	VERIFYF(cond, str, ...)		do {				\
+		if (unlikely(!cond))					\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY(" #cond ") failed " str "\n", __VA_ARGS__);\
+	} while (0)
+
 #define	VERIFY3B(LEFT, OP, RIGHT)	do {				\
 		const boolean_t _verify3_left = (boolean_t)(LEFT);	\
 		const boolean_t _verify3_right = (boolean_t)(RIGHT);	\
@ -123,7 +153,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 		if (unlikely(!(_verify3_left OP _verify3_right)))	\
 		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
 		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
-		    "failed (%p " #OP " %p)\n",				\
+		    "failed (%px " #OP " %px)\n",			\
 		    (void *)_verify3_left,				\
 		    (void *)_verify3_right);				\
 	} while (0)
@ -142,10 +172,98 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 		if (unlikely(!(0 == _verify0_right)))			\
 		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
 		    "VERIFY0P(" #RIGHT ") "				\
-		    "failed (NULL == %p)\n",				\
+		    "failed (NULL == %px)\n",				\
 		    (void *)_verify0_right);				\
 	} while (0)

+/*
+ * Note that you should not put any operations you want to always happen
+ * in the print section for ASSERTs unless you only want them to run on
+ * debug builds!
+ * e.g. ASSERT3UF(2, <, 3, "%s", foo(x)), foo(x) won't run on non-debug
+ * builds.
+ */
+
+#define	VERIFY3BF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const boolean_t _verify3_left = (boolean_t)(LEFT);	\
+		const boolean_t _verify3_right = (boolean_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%d " #OP " %d) " STR "\n",			\
+		    (boolean_t)(_verify3_left),				\
+		    (boolean_t)(_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY3SF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const int64_t _verify3_left = (int64_t)(LEFT);		\
+		const int64_t _verify3_right = (int64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%lld " #OP " %lld) " STR "\n",		\
+		    (long long)(_verify3_left),				\
+		    (long long)(_verify3_right),			\
+		    __VA_ARGS);						\
+	} while (0)
+
+#define	VERIFY3UF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const uint64_t _verify3_left = (uint64_t)(LEFT);	\
+		const uint64_t _verify3_right = (uint64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%llu " #OP " %llu) " STR "\n",		\
+		    (unsigned long long)(_verify3_left),		\
+		    (unsigned long long)(_verify3_right),		\
+		    __VA_ARGS);						\
+	} while (0)
+
+#define	VERIFY3PF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const uintptr_t _verify3_left = (uintptr_t)(LEFT);	\
+		const uintptr_t _verify3_right = (uintptr_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%px " #OP " %px) " STR "\n",		\
+		    (void *) (_verify3_left),				\
+		    (void *) (_verify3_right),				\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY0PF(RIGHT, STR, ...)	do {				\
+		const uintptr_t _verify3_left = (uintptr_t)(0);		\
+		const uintptr_t _verify3_right = (uintptr_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left == _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY0(0 == " #RIGHT ") "				\
+		    "failed (0 == %px) " STR "\n",			\
+		    (long long) (_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY0F(RIGHT, STR, ...)	do {				\
+		const int64_t _verify3_left = (int64_t)(0);		\
+		const int64_t _verify3_right = (int64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left == _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY0(0 == " #RIGHT ") "				\
+		    "failed (0 == %lld) " STR "\n",			\
+		    (long long) (_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY_IMPLY(A, B) \
+	((void)(likely((!(A)) || (B)) ||				\
+	    spl_assert("(" #A ") implies (" #B ")",			\
+	    __FILE__, __FUNCTION__, __LINE__)))
+
+#define	VERIFY_EQUIV(A, B) \
+	((void)(likely(!!(A) == !!(B)) || 				\
+	    spl_assert("(" #A ") is equivalent to (" #B ")",		\
+	    __FILE__, __FUNCTION__, __LINE__)))
+
 /*
 * Debugging disabled (--disable-debug)
 */
@ -162,6 +280,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 	((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z)))
 #define	ASSERT0(x)		((void) sizeof ((uintptr_t)(x)))
 #define	ASSERT0P(x)		((void) sizeof ((uintptr_t)(x)))
+#define	ASSERT3BF(x, y, z, str, ...)	ASSERT3B(x, y, z)
+#define	ASSERT3SF(x, y, z, str, ...)	ASSERT3S(x, y, z)
+#define	ASSERT3UF(x, y, z, str, ...)	ASSERT3U(x, y, z)
+#define	ASSERT3PF(x, y, z, str, ...)	ASSERT3P(x, y, z)
+#define	ASSERT0PF(x, str, ...)		ASSERT0P(x)
+#define	ASSERT0F(x, str, ...)		ASSERT0(x)
+#define	ASSERTF(x, str, ...)		ASSERT(x)
 #define	IMPLY(A, B)							\
 	((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B)))
 #define	EQUIV(A, B)		\
@ -178,16 +303,16 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 #define	ASSERT3P	VERIFY3P
 #define	ASSERT0		VERIFY0
 #define	ASSERT0P	VERIFY0P
+#define	ASSERT3BF	VERIFY3BF
+#define	ASSERT3SF	VERIFY3SF
+#define	ASSERT3UF	VERIFY3UF
+#define	ASSERT3PF	VERIFY3PF
+#define	ASSERT0PF	VERIFY0PF
+#define	ASSERT0F	VERIFY0F
+#define	ASSERTF		VERIFYF
 #define	ASSERT		VERIFY
-#define	IMPLY(A, B) \
-	((void)(likely((!(A)) || (B)) ||				\
-	    spl_assert("(" #A ") implies (" #B ")",			\
-	    __FILE__, __FUNCTION__, __LINE__)))
-#define	EQUIV(A, B) \
-	((void)(likely(!!(A) == !!(B)) || 				\
-	    spl_assert("(" #A ") is equivalent to (" #B ")",		\
-	    __FILE__, __FUNCTION__, __LINE__)))
-
+#define	IMPLY		VERIFY_IMPLY
+#define	EQUIV		VERIFY_EQUIV

 #endif /* NDEBUG */

--- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/dkio.h
+++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/dkio.h
@ -1,34 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or https://opensource.org/licenses/CDDL-1.0.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- *
- * $FreeBSD$
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _OPENSOLARIS_SYS_DKIO_H_
-#define	_OPENSOLARIS_SYS_DKIO_H_
-
-#define	DKIOC		(0x04 << 8)
-#define	DKIOCFLUSHWRITECACHE	(DKIOC|34)	/* flush cache to phys medium */
-
-#endif /* _OPENSOLARIS_SYS_DKIO_H_ */
--- a/sys/contrib/openzfs/include/os/linux/Makefile.am
+++ b/sys/contrib/openzfs/include/os/linux/Makefile.am
@ -47,6 +47,7 @@ kernel_sys_HEADERS = \

 kernel_spl_rpcdir = $(kerneldir)/spl/rpc
 kernel_spl_rpc_HEADERS = \
+	%D%/spl/rpc/types.h \
 	%D%/spl/rpc/xdr.h

 kernel_spl_sysdir = $(kerneldir)/spl/sys
@ -62,7 +63,6 @@ kernel_spl_sys_HEADERS = \
 	%D%/spl/sys/ctype.h \
 	%D%/spl/sys/debug.h \
 	%D%/spl/sys/disp.h \
-	%D%/spl/sys/dkio.h \
 	%D%/spl/sys/errno.h \
 	%D%/spl/sys/fcntl.h \
 	%D%/spl/sys/file.h \
--- a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h
@ -563,9 +563,11 @@ static inline boolean_t
 bdev_discard_supported(struct block_device *bdev)
 {
 #if defined(HAVE_BDEV_MAX_DISCARD_SECTORS)
-	return (!!bdev_max_discard_sectors(bdev));
+	return (bdev_max_discard_sectors(bdev) > 0 &&
+	    bdev_discard_granularity(bdev) > 0);
 #elif defined(HAVE_BLK_QUEUE_DISCARD)
-	return (!!blk_queue_discard(bdev_get_queue(bdev)));
+	return (blk_queue_discard(bdev_get_queue(bdev)) > 0 &&
+	    bdev_get_queue(bdev)->limits.discard_granularity > 0);
 #else
 #error "Unsupported kernel"
 #endif
--- a/sys/contrib/openzfs/include/os/linux/spl/rpc/types.h
+++ b/sys/contrib/openzfs/include/os/linux/spl/rpc/types.h
@ -1,9 +1,6 @@
 /*
- *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
- *  Copyright (C) 2007 The Regents of the University of California.
- *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
- *  UCRL-CODE-235197
+ *  Copyright (c) 2008 Sun Microsystems, Inc.
+ *  Written by Ricardo Correia <Ricardo.M.Correia@Sun.COM>
 *
 *  This file is part of the SPL, Solaris Porting Layer.
 *
@ -21,19 +18,13 @@
 *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef _SPL_DKIO_H
-#define	_SPL_DKIO_H
+#ifndef _SPL_RPC_TYPES_H
+#define	_SPL_RPC_TYPES_H

-#define	DFL_SZ(num_exts) \
-	(sizeof (dkioc_free_list_t) + (num_exts - 1) * 16)
+#include <sys/types.h>

-#define	DKIOC		(0x04 << 8)
-#define	DKIOCFLUSHWRITECACHE	(DKIOC|34)	/* flush cache to phys medium */
+/* Just enough to support rpc/xdr.h */

-/*
- * ioctl to free space (e.g. SCSI UNMAP) off a disk.
- * Pass a dkioc_free_list_t containing a list of extents to be freed.
- */
-#define	DKIOCFREE	(DKIOC|50)
+typedef int bool_t;

-#endif /* _SPL_DKIO_H */
+#endif /* SPL_RPC_TYPES_H */
--- a/sys/contrib/openzfs/include/os/linux/spl/rpc/xdr.h
+++ b/sys/contrib/openzfs/include/os/linux/spl/rpc/xdr.h
@ -23,8 +23,6 @@

 #include <sys/types.h>

-typedef int bool_t;
-
 /*
 * XDR enums and types.
 */
--- a/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h
+++ b/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h
@ -1,24 +1,29 @@
 /*
- *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
- *  Copyright (C) 2007 The Regents of the University of California.
- *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
- *  UCRL-CODE-235197
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
 *
- *  This file is part of the SPL, Solaris Porting Layer.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
 *
- *  The SPL is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; either version 2 of the License, or (at your
- *  option) any later version.
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
 *
- *  The SPL is distributed in the hope that it will be useful, but WITHOUT
- *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- *  for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ * $FreeBSD$
 */

 /*
@ -47,10 +52,17 @@
 #ifndef _SPL_DEBUG_H
 #define	_SPL_DEBUG_H

+
 /*
 * Common DEBUG functionality.
 */
+#ifdef __FreeBSD__
+#include <linux/compiler.h>
+#endif
+
+#ifndef __printflike
 #define	__printflike(a, b)	__printf(a, b)
+#endif

 #ifndef __maybe_unused
 #define	__maybe_unused __attribute__((unused))
@ -80,6 +92,14 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 	return (0);
 }

+#ifndef expect
+#define	expect(expr, value) (__builtin_expect((expr), (value)))
+#endif
+#ifndef __linux__
+#define	likely(expr)   expect((expr) != 0, 1)
+#define	unlikely(expr) expect((expr) != 0, 0)
+#endif
+
 #define	PANIC(fmt, a...)						\
 	spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a)

@ -88,6 +108,12 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 	    spl_assert("VERIFY(" #cond ") failed\n",			\
 	    __FILE__, __FUNCTION__, __LINE__))

+#define	VERIFYF(cond, str, ...)		do {				\
+		if (unlikely(!cond))					\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY(" #cond ") failed " str "\n", __VA_ARGS__);\
+	} while (0)
+
 #define	VERIFY3B(LEFT, OP, RIGHT)	do {				\
 		const boolean_t _verify3_left = (boolean_t)(LEFT);	\
 		const boolean_t _verify3_right = (boolean_t)(RIGHT);	\
@ -150,6 +176,84 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 		    (void *)_verify0_right);				\
 	} while (0)

+/*
+ * Note that you should not put any operations you want to always happen
+ * in the print section for ASSERTs unless you only want them to run on
+ * debug builds!
+ * e.g. ASSERT3UF(2, <, 3, "%s", foo(x)), foo(x) won't run on non-debug
+ * builds.
+ */
+
+#define	VERIFY3BF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const boolean_t _verify3_left = (boolean_t)(LEFT);	\
+		const boolean_t _verify3_right = (boolean_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%d " #OP " %d) " STR "\n",			\
+		    (boolean_t)(_verify3_left),				\
+		    (boolean_t)(_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY3SF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const int64_t _verify3_left = (int64_t)(LEFT);		\
+		const int64_t _verify3_right = (int64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%lld " #OP " %lld) " STR "\n",		\
+		    (long long)(_verify3_left),				\
+		    (long long)(_verify3_right),			\
+		    __VA_ARGS);						\
+	} while (0)
+
+#define	VERIFY3UF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const uint64_t _verify3_left = (uint64_t)(LEFT);	\
+		const uint64_t _verify3_right = (uint64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%llu " #OP " %llu) " STR "\n",		\
+		    (unsigned long long)(_verify3_left),		\
+		    (unsigned long long)(_verify3_right),		\
+		    __VA_ARGS);						\
+	} while (0)
+
+#define	VERIFY3PF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const uintptr_t _verify3_left = (uintptr_t)(LEFT);	\
+		const uintptr_t _verify3_right = (uintptr_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%px " #OP " %px) " STR "\n",		\
+		    (void *) (_verify3_left),				\
+		    (void *) (_verify3_right),				\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY0PF(RIGHT, STR, ...)	do {				\
+		const uintptr_t _verify3_left = (uintptr_t)(0);		\
+		const uintptr_t _verify3_right = (uintptr_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left == _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY0(0 == " #RIGHT ") "				\
+		    "failed (0 == %px) " STR "\n",			\
+		    (long long) (_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY0F(RIGHT, STR, ...)	do {				\
+		const int64_t _verify3_left = (int64_t)(0);		\
+		const int64_t _verify3_right = (int64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left == _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY0(0 == " #RIGHT ") "				\
+		    "failed (0 == %lld) " STR "\n",			\
+		    (long long) (_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
 #define	VERIFY_IMPLY(A, B) \
 	((void)(likely((!(A)) || (B)) ||				\
 	    spl_assert("(" #A ") implies (" #B ")",			\
@ -176,6 +280,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 	((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z)))
 #define	ASSERT0(x)		((void) sizeof ((uintptr_t)(x)))
 #define	ASSERT0P(x)		((void) sizeof ((uintptr_t)(x)))
+#define	ASSERT3BF(x, y, z, str, ...)	ASSERT3B(x, y, z)
+#define	ASSERT3SF(x, y, z, str, ...)	ASSERT3S(x, y, z)
+#define	ASSERT3UF(x, y, z, str, ...)	ASSERT3U(x, y, z)
+#define	ASSERT3PF(x, y, z, str, ...)	ASSERT3P(x, y, z)
+#define	ASSERT0PF(x, str, ...)		ASSERT0P(x)
+#define	ASSERT0F(x, str, ...)		ASSERT0(x)
+#define	ASSERTF(x, str, ...)		ASSERT(x)
 #define	IMPLY(A, B)							\
 	((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B)))
 #define	EQUIV(A, B)		\
@ -192,6 +303,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 #define	ASSERT3P	VERIFY3P
 #define	ASSERT0		VERIFY0
 #define	ASSERT0P	VERIFY0P
+#define	ASSERT3BF	VERIFY3BF
+#define	ASSERT3SF	VERIFY3SF
+#define	ASSERT3UF	VERIFY3UF
+#define	ASSERT3PF	VERIFY3PF
+#define	ASSERT0PF	VERIFY0PF
+#define	ASSERT0F	VERIFY0F
+#define	ASSERTF		VERIFYF
 #define	ASSERT		VERIFY
 #define	IMPLY		VERIFY_IMPLY
 #define	EQUIV		VERIFY_EQUIV
--- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_common.h
+++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_common.h
@ -31,7 +31,6 @@
 /* ZIO macros */
 #define	ZIO_TP_STRUCT_ENTRY						\
 		__field(zio_type_t,		zio_type)		\
-		__field(int,			zio_cmd)		\
 		__field(zio_priority_t,		zio_priority)		\
 		__field(uint64_t,		zio_size)		\
 		__field(uint64_t,		zio_orig_size)		\
@ -61,7 +60,6 @@

 #define	ZIO_TP_FAST_ASSIGN						    \
 		__entry->zio_type		= zio->io_type;		    \
-		__entry->zio_cmd		= zio->io_cmd;		    \
 		__entry->zio_priority		= zio->io_priority;	    \
 		__entry->zio_size		= zio->io_size;		    \
 		__entry->zio_orig_size		= zio->io_orig_size;	    \
@ -90,7 +88,7 @@
 		__entry->zp_dedup_verify	= zio->io_prop.zp_dedup_verify;

 #define	ZIO_TP_PRINTK_FMT						\
-	"zio { type %u cmd %i prio %u size %llu orig_size %llu "	\
+	"zio { type %u prio %u size %llu orig_size %llu "		\
 	"offset %llu timestamp %llu delta %llu delay %llu "		\
 	"flags 0x%llx stage 0x%x pipeline 0x%x orig_flags 0x%llx "	\
 	"orig_stage 0x%x orig_pipeline 0x%x reexecute %u "		\
@ -98,7 +96,7 @@
 	"type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }"

 #define	ZIO_TP_PRINTK_ARGS						\
-	__entry->zio_type, __entry->zio_cmd, __entry->zio_priority,	\
+	__entry->zio_type, __entry->zio_priority,			\
 	__entry->zio_size, __entry->zio_orig_size, __entry->zio_offset,	\
 	__entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay,	\
 	__entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline,	\
--- a/sys/contrib/openzfs/include/sys/dmu_zfetch.h
+++ b/sys/contrib/openzfs/include/sys/dmu_zfetch.h
@ -45,18 +45,24 @@ typedef struct zfetch {
 	int		zf_numstreams;	/* number of zstream_t's */
 } zfetch_t;

+typedef struct zsrange {
+	uint16_t	start;
+	uint16_t	end;
+} zsrange_t;
+
+#define	ZFETCH_RANGES	9		/* Fits zstream_t into 128 bytes */
+
 typedef struct zstream {
+	list_node_t	zs_node;	/* link for zf_stream */
 	uint64_t	zs_blkid;	/* expect next access at this blkid */
+	uint_t		zs_atime;	/* time last prefetch issued */
+	zsrange_t	zs_ranges[ZFETCH_RANGES]; /* ranges from future */
 	unsigned int	zs_pf_dist;	/* data prefetch distance in bytes */
 	unsigned int	zs_ipf_dist;	/* L1 prefetch distance in bytes */
 	uint64_t	zs_pf_start;	/* first data block to prefetch */
 	uint64_t	zs_pf_end;	/* data block to prefetch up to */
 	uint64_t	zs_ipf_start;	/* first data block to prefetch L1 */
 	uint64_t	zs_ipf_end;	/* data block to prefetch L1 up to */
-
-	list_node_t	zs_node;	/* link for zf_stream */
-	hrtime_t	zs_atime;	/* time last prefetch issued */
-	zfetch_t	*zs_fetch;	/* parent fetch */
 	boolean_t	zs_missed;	/* stream saw cache misses */
 	boolean_t	zs_more;	/* need more distant prefetch */
 	zfs_refcount_t	zs_callers;	/* number of pending callers */
@ -74,7 +80,7 @@ void		dmu_zfetch_init(zfetch_t *, struct dnode *);
 void		dmu_zfetch_fini(zfetch_t *);
 zstream_t	*dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
    boolean_t);
-void		dmu_zfetch_run(zstream_t *, boolean_t, boolean_t);
+void		dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t);
 void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
    boolean_t);

--- a/sys/contrib/openzfs/include/sys/fs/zfs.h
+++ b/sys/contrib/openzfs/include/sys/fs/zfs.h
@ -1094,11 +1094,17 @@ typedef enum zio_type {
 	ZIO_TYPE_WRITE,
 	ZIO_TYPE_FREE,
 	ZIO_TYPE_CLAIM,
-	ZIO_TYPE_IOCTL,
+	ZIO_TYPE_FLUSH,
 	ZIO_TYPE_TRIM,
 	ZIO_TYPES
 } zio_type_t;

+/*
+ * Compatibility: _IOCTL was renamed to _FLUSH; keep the old name available to
+ * user programs.
+ */
+#define	ZIO_TYPE_IOCTL	ZIO_TYPE_FLUSH
+
 /*
 * Pool statistics.  Note: all fields should be 64-bit because this
 * is passed between kernel and userland as an nvlist uint64 array.
--- a/sys/contrib/openzfs/include/sys/multilist.h
+++ b/sys/contrib/openzfs/include/sys/multilist.h
@ -82,12 +82,15 @@ int  multilist_is_empty(multilist_t *);
 unsigned int multilist_get_num_sublists(multilist_t *);
 unsigned int multilist_get_random_index(multilist_t *);

-multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
+void multilist_sublist_lock(multilist_sublist_t *);
+multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int);
 multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
 void multilist_sublist_unlock(multilist_sublist_t *);

 void multilist_sublist_insert_head(multilist_sublist_t *, void *);
 void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
+void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *);
+void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *);
 void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
 void multilist_sublist_remove(multilist_sublist_t *, void *);
 int  multilist_sublist_is_empty(multilist_sublist_t *);
--- a/sys/contrib/openzfs/include/sys/vdev_impl.h
+++ b/sys/contrib/openzfs/include/sys/vdev_impl.h
@ -35,7 +35,6 @@
 #include <sys/nvpair.h>
 #include <sys/space_map.h>
 #include <sys/vdev.h>
-#include <sys/dkio.h>
 #include <sys/uberblock_impl.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
@ -455,7 +454,7 @@ struct vdev {
 	zfs_ratelimit_t vdev_checksum_rl;

 	/*
-	 * Vdev properties for tuning ZED
+	 * Vdev properties for tuning ZED or zfsd
 	 */
 	uint64_t	vdev_checksum_n;
 	uint64_t	vdev_checksum_t;
--- a/sys/contrib/openzfs/include/sys/zap_leaf.h
+++ b/sys/contrib/openzfs/include/sys/zap_leaf.h
@ -132,7 +132,7 @@ typedef struct zap_leaf_phys {
 	 * with the ZAP_LEAF_CHUNK() macro.
 	 */

-	uint16_t l_hash[1];
+	uint16_t l_hash[];
 } zap_leaf_phys_t;

 typedef union zap_leaf_chunk {
--- a/sys/contrib/openzfs/include/sys/zio.h
+++ b/sys/contrib/openzfs/include/sys/zio.h
@ -27,7 +27,7 @@
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright (c) 2019, Allan Jude
- * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, 2023, 2024, Klara Inc.
 * Copyright (c) 2019-2020, Michael Niewöhner
 */

@ -451,7 +451,6 @@ struct zio {
 	zio_type_t	io_type;
 	enum zio_child	io_child_type;
 	enum trim_flag	io_trim_flags;
-	int		io_cmd;
 	zio_priority_t	io_priority;
 	uint8_t		io_reexecute;
 	uint8_t		io_state[ZIO_WAIT_TYPES];
@ -579,9 +578,6 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
    const blkptr_t *bp,
    zio_done_func_t *done, void *priv, zio_flag_t flags);

-extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *priv, zio_flag_t flags);
-
 extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    zio_done_func_t *done, void *priv, zio_priority_t priority,
    zio_flag_t flags, enum trim_flag trim_flags);
--- a/sys/contrib/openzfs/include/sys/zio_impl.h
+++ b/sys/contrib/openzfs/include/sys/zio_impl.h
@ -40,7 +40,7 @@ extern "C" {
 *
 * The ZFS I/O pipeline is comprised of various stages which are defined
 * in the zio_stage enum below. The individual stages are used to construct
- * these basic I/O operations: Read, Write, Free, Claim, Ioctl and Trim.
+ * these basic I/O operations: Read, Write, Free, Claim, Flush and Trim.
 *
 * I/O operations: (XXX - provide detail for each of the operations)
 *
@ -48,7 +48,7 @@ extern "C" {
 * Write:
 * Free:
 * Claim:
- * Ioctl:
+ * Flush:
 * Trim:
 *
 * Although the most common pipeline are used by the basic I/O operations
@ -122,7 +122,7 @@ extern "C" {
 * zio pipeline stage definitions
 */
 enum zio_stage {
-	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCIT */
+	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCXT */

 	ZIO_STAGE_READ_BP_INIT		= 1 << 1,	/* R----- */
 	ZIO_STAGE_WRITE_BP_INIT		= 1 << 2,	/* -W---- */
@ -150,15 +150,15 @@ enum zio_stage {
 	ZIO_STAGE_DVA_FREE		= 1 << 18,	/* --F--- */
 	ZIO_STAGE_DVA_CLAIM		= 1 << 19,	/* ---C-- */

-	ZIO_STAGE_READY			= 1 << 20,	/* RWFCIT */
+	ZIO_STAGE_READY			= 1 << 20,	/* RWFCXT */

-	ZIO_STAGE_VDEV_IO_START		= 1 << 21,	/* RW--IT */
-	ZIO_STAGE_VDEV_IO_DONE		= 1 << 22,	/* RW---T */
-	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 23,	/* RW--IT */
+	ZIO_STAGE_VDEV_IO_START		= 1 << 21,	/* RW--XT */
+	ZIO_STAGE_VDEV_IO_DONE		= 1 << 22,	/* RW--XT */
+	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 23,	/* RW--XT */

 	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 24,	/* R----- */

-	ZIO_STAGE_DONE			= 1 << 25	/* RWFCIT */
+	ZIO_STAGE_DONE			= 1 << 25	/* RWFCXT */
 };

 #define	ZIO_ROOT_PIPELINE			\
@ -259,10 +259,9 @@ enum zio_stage {
 	(ZIO_INTERLOCK_STAGES |			\
 	ZIO_STAGE_DVA_CLAIM)

-#define	ZIO_IOCTL_PIPELINE			\
+#define	ZIO_FLUSH_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
-	ZIO_STAGE_VDEV_IO_START |		\
-	ZIO_STAGE_VDEV_IO_ASSESS)
+	ZIO_VDEV_IO_STAGES)

 #define	ZIO_TRIM_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
--- a/sys/contrib/openzfs/lib/libspl/include/assert.h
+++ b/sys/contrib/openzfs/lib/libspl/include/assert.h
@ -70,6 +70,15 @@ libspl_assert(const char *buf, const char *file, const char *func, int line)
 #define	VERIFY(cond)							\
 	(void) ((!(cond)) &&						\
 	    libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__))
+
+#define	VERIFYF(cond, STR, ...)						\
+do {									\
+	if (!(cond))							\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s " STR, #cond,					\
+		    __VA_ARGS__);					\
+} while (0)
+
 #define	verify(cond)							\
 	(void) ((!(cond)) &&						\
 	    libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__))
@ -132,6 +141,79 @@ do {									\
 		    (void *)__left);					\
 } while (0)

+/*
+ * This is just here because cstyle gets upset about #LEFT
+ * on a newline.
+ */
+
+/* BEGIN CSTYLED */
+#define	VERIFY3BF(LEFT, OP, RIGHT, STR, ...)				\
+do {									\
+	const boolean_t __left = (boolean_t)(LEFT);			\
+	const boolean_t __right = (boolean_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx) " STR,			\
+		    #LEFT, #OP, #RIGHT,					\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right,	\
+		    __VA_ARGS__);					\
+} while (0)
+
+#define	VERIFY3SF(LEFT, OP, RIGHT, STR, ...)				\
+do {									\
+	const int64_t __left = (int64_t)(LEFT);				\
+	const int64_t __right = (int64_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx) " STR,			\
+		    #LEFT, #OP, #RIGHT,					\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right,	\
+		    __VA_ARGS__);					\
+} while (0)
+
+#define	VERIFY3UF(LEFT, OP, RIGHT, STR, ...)				\
+do {									\
+	const uint64_t __left = (uint64_t)(LEFT);			\
+	const uint64_t __right = (uint64_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx) " STR,			\
+		    #LEFT, #OP, #RIGHT,					\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right,	\
+		    __VA_ARGS__);					\
+} while (0)
+
+#define	VERIFY3PF(LEFT, OP, RIGHT, STR, ...)				\
+do {									\
+	const uintptr_t __left = (uintptr_t)(LEFT);			\
+	const uintptr_t __right = (uintptr_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx) " STR,			\
+		    #LEFT, #OP, #RIGHT,					\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right,	\
+		    __VA_ARGS__);					\
+} while (0)
+/* END CSTYLED */
+
+#define	VERIFY0F(LEFT, STR, ...)					\
+do {									\
+	const uint64_t __left = (uint64_t)(LEFT);			\
+	if (!(__left == 0))						\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s == 0 (0x%llx == 0) " STR, #LEFT,		\
+		    (u_longlong_t)__left, __VA_ARGS__);			\
+} while (0)
+
+#define	VERIFY0PF(LEFT, STR, ...)					\
+do {									\
+	const uintptr_t __left = (uintptr_t)(LEFT);			\
+	if (!(__left == 0))						\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s == 0 (%p == 0) " STR, #LEFT,			\
+		    (u_longlong_t)__left, __VA_ARGS__);			\
+} while (0)
+
 #ifdef assert
 #undef assert
 #endif
@ -147,7 +229,15 @@ do {									\
 	((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z)))
 #define	ASSERT0(x)		((void) sizeof ((uintptr_t)(x)))
 #define	ASSERT0P(x)		((void) sizeof ((uintptr_t)(x)))
+#define	ASSERT3BF(x, y, z, str, ...)	ASSERT3B(x, y, z)
+#define	ASSERT3SF(x, y, z, str, ...)	ASSERT3S(x, y, z)
+#define	ASSERT3UF(x, y, z, str, ...)	ASSERT3U(x, y, z)
+#define	ASSERT3PF(x, y, z, str, ...)	ASSERT3P(x, y, z)
+#define	ASSERT0P(x)		((void) sizeof ((uintptr_t)(x)))
+#define	ASSERT0PF(x, str, ...)		ASSERT0P(x)
+#define	ASSERT0F(x, str, ...)		ASSERT0(x)
 #define	ASSERT(x)		((void) sizeof ((uintptr_t)(x)))
+#define	ASSERTF(x, str, ...)	ASSERT(x)
 #define	assert(x)		((void) sizeof ((uintptr_t)(x)))
 #define	IMPLY(A, B)							\
 	((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B)))
@ -160,7 +250,14 @@ do {									\
 #define	ASSERT3P	VERIFY3P
 #define	ASSERT0		VERIFY0
 #define	ASSERT0P	VERIFY0P
+#define	ASSERT3BF	VERIFY3BF
+#define	ASSERT3SF	VERIFY3SF
+#define	ASSERT3UF	VERIFY3UF
+#define	ASSERT3PF	VERIFY3PF
+#define	ASSERT0PF	VERIFY0PF
+#define	ASSERT0F	VERIFY0F
 #define	ASSERT		VERIFY
+#define	ASSERTF		VERIFYF
 #define	assert		VERIFY
 #define	IMPLY(A, B) \
 	((void)(((!(A)) || (B)) || \
--- a/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c
+++ b/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c
@ -1900,7 +1900,8 @@ zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
 	(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);

 	if (localtime_r((time_t *)&rewindto, &t) != NULL &&
-	    strftime(timestr, 128, "%c", &t) != 0) {
+	    ctime_r((time_t *)&rewindto, timestr) != NULL) {
+		timestr[24] = 0;
 		if (dryrun) {
 			(void) printf(dgettext(TEXT_DOMAIN,
 			    "Would be able to return %s "
@ -1962,7 +1963,8 @@ zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
 	    "Recovery is possible, but will result in some data loss.\n"));

 	if (localtime_r((time_t *)&rewindto, &t) != NULL &&
-	    strftime(timestr, 128, "%c", &t) != 0) {
+	    ctime_r((time_t *)&rewindto, timestr) != NULL) {
+		timestr[24] = 0;
 		(void) printf(dgettext(TEXT_DOMAIN,
 		    "\tReturning the pool to its state as of %s\n"
 		    "\tshould correct the problem.  "),
--- a/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c
+++ b/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c
@ -1053,6 +1053,7 @@ send_progress_thread(void *arg)
 		}
 	}
 	pthread_cleanup_pop(B_TRUE);
+	return (NULL);
 }

 static boolean_t
--- a/sys/contrib/openzfs/man/Makefile.am
+++ b/sys/contrib/openzfs/man/Makefile.am
@ -62,7 +62,6 @@ dist_man_MANS = \
 	%D%/man8/zfs-userspace.8 \
 	%D%/man8/zfs-wait.8 \
 	%D%/man8/zfs_ids_to_path.8 \
-	%D%/man8/zfs_prepare_disk.8 \
 	%D%/man8/zgenhostid.8 \
 	%D%/man8/zinject.8 \
 	%D%/man8/zpool.8 \
@ -115,7 +114,8 @@ endif

 nodist_man_MANS = \
 	%D%/man8/zed.8 \
-	%D%/man8/zfs-mount-generator.8
+	%D%/man8/zfs-mount-generator.8 \
+	%D%/man8/zfs_prepare_disk.8

 dist_noinst_DATA += $(dist_noinst_man_MANS) $(dist_man_MANS)

--- a/sys/contrib/openzfs/man/man4/zfs.4
+++ b/sys/contrib/openzfs/man/man4/zfs.4
@ -564,6 +564,10 @@ However, this is limited by
 Maximum micro ZAP size.
 A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
 .
+.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint
+Log2 fraction of holes in speculative prefetch stream allowed for it to
+proceed.
+.
 .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
 Min bytes to prefetch per stream.
 Prefetch distance starts from the demand access size and quickly grows to
@ -578,6 +582,13 @@ Max bytes to prefetch per stream.
 .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint
 Max bytes to prefetch indirects for per stream.
 .
+.It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint
+Requests within this byte distance from the current prefetch stream position
+are considered parts of the stream, reordered due to parallel processing.
+Such requests do not advance the stream position immediately unless
+.Sy zfetch_hole_shift
+fill threshold is reached, but saved to fill holes in the stream later.
+.
 .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint
 Max number of streams per zfetch (prefetch streams per file).
 .
@ -2387,6 +2398,13 @@ The number of requests which can be handled concurrently is controlled by
 is ignored when running on a kernel that supports block multiqueue
 .Pq Li blk-mq .
 .
+.It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint
+Number of zvol taskqs.
+If
+.Sy 0
+(the default) then scaling is done internally to prefer 6 threads per taskq.
+This only applies on Linux.
+.
 .It Sy zvol_threads Ns = Ns Sy 0 Pq uint
 The number of system wide threads to use for processing zvol block IOs.
 If
--- a/sys/contrib/openzfs/man/man7/vdevprops.7
+++ b/sys/contrib/openzfs/man/man7/vdevprops.7
@ -127,7 +127,13 @@ If the property is only set on the top-level vdev, this value will be used.
 The value of these properties do not persist across vdev replacement.
 For this reason, it is advisable to set the property on the top-level vdev -
 not on the leaf vdev itself.
-The default values are 10 errors in 600 seconds.
+The default values for
+.Sy OpenZFS on Linux
+are 10 errors in 600 seconds.
+For
+.Sy OpenZFS on FreeBSD
+defaults see
+.Xr zfsd 8 .
 .It Sy comment
 A text comment up to 8192 characters long
 .It Sy bootsize
--- a/sys/contrib/openzfs/man/man8/zfs-mount.8
+++ b/sys/contrib/openzfs/man/man8/zfs-mount.8
@ -43,7 +43,7 @@
 .Cm mount
 .Op Fl Oflv
 .Op Fl o Ar options
-.Fl a Ns | Ns Ar filesystem
+.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem
 .Nm zfs
 .Cm unmount
 .Op Fl fu
@ -61,7 +61,7 @@ Displays all ZFS file systems currently mounted.
 .Cm mount
 .Op Fl Oflv
 .Op Fl o Ar options
-.Fl a Ns | Ns Ar filesystem
+.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem
 .Xc
 Mount ZFS filesystem on a path described by its
 .Sy mountpoint
@ -83,6 +83,8 @@ for more information.
 .It Fl a
 Mount all available ZFS file systems.
 Invoked automatically as part of the boot process if configured.
+.It Fl R
+Mount the specified filesystems along with all their children.
 .It Ar filesystem
 Mount the specified filesystem.
 .It Fl o Ar options
--- a/sys/contrib/openzfs/man/man8/zinject.8
+++ b/sys/contrib/openzfs/man/man8/zinject.8
@ -19,10 +19,11 @@
 .\" CDDL HEADER END
 .\"
 .\" Copyright 2013 Darik Horn <dajhorn@vanadac.com>. All rights reserved.
+.\" Copyright (c) 2024, Klara Inc.
 .\"
 .\" lint-ok: WARNING: sections out of conventional order: Sh SYNOPSIS
 .\"
-.Dd May 26, 2021
+.Dd April 4, 2024
 .Dt ZINJECT 8
 .Os
 .
@ -210,9 +211,11 @@ to flip a bit in the data after a read,
 .It Sy dtl
 for an ECHILD error,
 .It Sy io
-for an EIO error where reopening the device will succeed, or
+for an EIO error where reopening the device will succeed,
 .It Sy nxio
-for an ENXIO error where reopening the device will fail.
+for an ENXIO error where reopening the device will fail, or
+.It Sy noop
+to drop the IO without executing it, and return success.
 .El
 .Pp
 For EIO and ENXIO, the "failed" reads or writes still occur.
@ -257,6 +260,7 @@ Run for this many seconds before reporting failure.
 .It Fl T Ar failure
 Set the failure type to one of
 .Sy all ,
+.Sy ioctl ,
 .Sy claim ,
 .Sy free ,
 .Sy read ,
--- a/sys/contrib/openzfs/man/man8/zpool-events.8
+++ b/sys/contrib/openzfs/man/man8/zpool-events.8
@ -364,7 +364,7 @@ that is, the bits set in the good data which are cleared in the bad data.
 .Sh I/O STAGES
 The ZFS I/O pipeline is comprised of various stages which are defined below.
 The individual stages are used to construct these basic I/O
-operations: Read, Write, Free, Claim, Ioctl and Trim.
+operations: Read, Write, Free, Claim, Flush and Trim.
 These stages may be
 set on an event to describe the life cycle of a given I/O request.
 .Pp
@ -373,7 +373,7 @@ tab(:);
 l l l .
 Stage:Bit Mask:Operations
 _:_:_
-ZIO_STAGE_OPEN:0x00000001:RWFCIT
+ZIO_STAGE_OPEN:0x00000001:RWFCXT

 ZIO_STAGE_READ_BP_INIT:0x00000002:R-----
 ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W----
@ -403,13 +403,13 @@ ZIO_STAGE_DVA_CLAIM:0x00080000:---C--

 ZIO_STAGE_READY:0x00100000:RWFCIT

-ZIO_STAGE_VDEV_IO_START:0x00200000:RW--IT
-ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW---T
-ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--IT
+ZIO_STAGE_VDEV_IO_START:0x00200000:RW--XT
+ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--XT
+ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--XT

 ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R-----

-ZIO_STAGE_DONE:0x02000000:RWFCIT
+ZIO_STAGE_DONE:0x02000000:RWFCXT
 .TE
 .
 .Sh I/O FLAGS
--- a/sys/contrib/openzfs/module/Makefile.bsd
+++ b/sys/contrib/openzfs/module/Makefile.bsd
@ -82,12 +82,9 @@ CFLAGS+= -DBITS_PER_LONG=64

 SRCS=	vnode_if.h device_if.h bus_if.h

-# avl
+#avl
 SRCS+=	avl.c

-# icp
-SRCS+=	edonr.c
-
 #icp/algs/blake3
 SRCS+=	blake3.c \
 	blake3_generic.c \
@ -107,9 +104,12 @@ SRCS+=	blake3_avx2.S \
 	blake3_sse2.S \
 	blake3_sse41.S

+#icp/algs/edonr
+SRCS+=	edonr.c
+
 #icp/algs/sha2
-SRCS+=	sha2_generic.c \
-	sha256_impl.c \
+SRCS+=	sha256_impl.c \
+	sha2_generic.c \
 	sha512_impl.c

 #icp/asm-arm/sha2
@ -122,8 +122,8 @@ SRCS+=	sha256-armv8.S \

 #icp/asm-ppc64/sha2
 SRCS+=	sha256-p8.S \
-	sha512-p8.S \
 	sha256-ppc.S \
+	sha512-p8.S \
 	sha512-ppc.S

 #icp/asm-x86_64/sha2
@ -157,10 +157,10 @@ SRCS+=	lapi.c \
 	lzio.c

 #nvpair
-SRCS+=	nvpair.c \
-	fnvpair.c \
-	nvpair_alloc_spl.c \
-	nvpair_alloc_fixed.c
+SRCS+=	fnvpair.c \
+	nvpair.c \
+	nvpair_alloc_fixed.c \
+	nvpair_alloc_spl.c

 #os/freebsd/spl
 SRCS+=	acl_common.c \
@ -184,7 +184,6 @@ SRCS+=	acl_common.c \
 	spl_zlib.c \
 	spl_zone.c

-
 .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \
 	${MACHINE_ARCH} == "powerpcspe" || ${MACHINE_ARCH} == "arm"
 SRCS+= spl_atomic.c
@ -207,6 +206,7 @@ SRCS+=	abd_os.c \
 	zfs_ctldir.c \
 	zfs_debug.c \
 	zfs_dir.c \
+	zfs_file_os.c \
 	zfs_ioctl_compat.c \
 	zfs_ioctl_os.c \
 	zfs_racct.c \
@ -217,19 +217,20 @@ SRCS+=	abd_os.c \
 	zvol_os.c

 #unicode
-SRCS+=	uconv.c \
-	u8_textprep.c
+SRCS+= 	u8_textprep.c \
+	uconv.c

 #zcommon
-SRCS+=	zfeature_common.c \
+SRCS+=	cityhash.c \
+	zfeature_common.c \
 	zfs_comutil.c \
 	zfs_deleg.c \
-	zfs_fletcher.c \
 	zfs_fletcher_avx512.c \
+	zfs_fletcher.c \
 	zfs_fletcher_intel.c \
 	zfs_fletcher_sse.c \
-	zfs_fletcher_superscalar.c \
 	zfs_fletcher_superscalar4.c \
+	zfs_fletcher_superscalar.c \
 	zfs_namecheck.c \
 	zfs_prop.c \
 	zpool_prop.c \
@ -243,14 +244,13 @@ SRCS+=	abd.c \
 	blkptr.c \
 	bplist.c \
 	bpobj.c \
-	brt.c \
-	btree.c \
-	cityhash.c \
-	dbuf.c \
-	dbuf_stats.c \
 	bptree.c \
 	bqueue.c \
+	brt.c \
+	btree.c \
 	dataset_kstats.c \
+	dbuf.c \
+	dbuf_stats.c \
 	ddt.c \
 	ddt_stats.c \
 	ddt_zap.c \
@ -266,13 +266,13 @@ SRCS+=	abd.c \
 	dmu_zfetch.c \
 	dnode.c \
 	dnode_sync.c \
+	dsl_bookmark.c \
+	dsl_crypt.c \
 	dsl_dataset.c \
 	dsl_deadlist.c \
 	dsl_deleg.c \
-	dsl_bookmark.c \
-	dsl_dir.c \
-	dsl_crypt.c \
 	dsl_destroy.c \
+	dsl_dir.c \
 	dsl_pool.c \
 	dsl_prop.c \
 	dsl_scan.c \
@ -281,9 +281,9 @@ SRCS+=	abd.c \
 	edonr_zfs.c \
 	fm.c \
 	gzip.c \
-	lzjb.c \
 	lz4.c \
 	lz4_zfs.c \
+	lzjb.c \
 	metaslab.c \
 	mmp.c \
 	multilist.c \
@ -296,6 +296,8 @@ SRCS+=	abd.c \
 	sha2_zfs.c \
 	skein_zfs.c \
 	spa.c \
+	space_map.c \
+	space_reftree.c \
 	spa_checkpoint.c \
 	spa_config.c \
 	spa_errlog.c \
@ -303,16 +305,14 @@ SRCS+=	abd.c \
 	spa_log_spacemap.c \
 	spa_misc.c \
 	spa_stats.c \
-	space_map.c \
-	space_reftree.c \
 	txg.c \
 	uberblock.c \
 	unique.c \
 	vdev.c \
 	vdev_draid.c \
 	vdev_draid_rand.c \
-	vdev_indirect.c \
 	vdev_indirect_births.c \
+	vdev_indirect.c \
 	vdev_indirect_mapping.c \
 	vdev_initialize.c \
 	vdev_label.c \
@ -320,11 +320,11 @@ SRCS+=	abd.c \
 	vdev_missing.c \
 	vdev_queue.c \
 	vdev_raidz.c \
-	vdev_raidz_math.c \
-	vdev_raidz_math_scalar.c \
 	vdev_raidz_math_avx2.c \
 	vdev_raidz_math_avx512bw.c \
 	vdev_raidz_math_avx512f.c \
+	vdev_raidz_math.c \
+	vdev_raidz_math_scalar.c \
 	vdev_raidz_math_sse2.c \
 	vdev_raidz_math_ssse3.c \
 	vdev_rebuild.c \
@ -343,7 +343,6 @@ SRCS+=	abd.c \
 	zfeature.c \
 	zfs_byteswap.c \
 	zfs_chksum.c \
-	zfs_file_os.c \
 	zfs_fm.c \
 	zfs_fuid.c \
 	zfs_impl.c \
@ -367,30 +366,36 @@ SRCS+=	abd.c \
 	zvol.c

 #zstd
-SRCS+=	zfs_zstd.c \
-	entropy_common.c \
+SRCS+=	zfs_zstd.c
+
+#zstd/common
+SRCS+=	entropy_common.c \
 	error_private.c \
-	fse_compress.c \
 	fse_decompress.c \
-	hist.c \
-	huf_compress.c \
-	huf_decompress.c \
 	pool.c \
 	xxhash.c \
 	zstd_common.c \
+
+#zstd/compress
+SRCS+=	fse_compress.c \
+	hist.c \
+	huf_compress.c \
 	zstd_compress.c \
 	zstd_compress_literals.c \
 	zstd_compress_sequences.c \
 	zstd_compress_superblock.c \
-	zstd_ddict.c \
-	zstd_decompress.c \
-	zstd_decompress_block.c \
 	zstd_double_fast.c \
 	zstd_fast.c \
 	zstd_lazy.c \
 	zstd_ldm.c \
 	zstd_opt.c

+#zstd/decompress
+SRCS+=	huf_decompress.c \
+	zstd_ddict.c \
+	zstd_decompress_block.c \
+	zstd_decompress.c
+
 beforeinstall:
 .if ${MK_DEBUG_FILES} != "no"
 	mtree -eu \
--- a/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
+++ b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
@ -32,6 +32,14 @@
 */

 #if defined(__aarch64__)
+
+/* make gcc <= 9 happy */
+#if LD_VERSION >= 233010000
+#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state
+#else
+#define CFI_NEGATE_RA_STATE
+#endif
+
 	.text
 	.section	.note.gnu.property,"a",@note
 	.p2align	3
@ -51,7 +59,7 @@
 zfs_blake3_compress_in_place_sse2:
 	.cfi_startproc
 	hint	#25
-	.cfi_negate_ra_state
+	CFI_NEGATE_RA_STATE
 	sub	sp, sp, #96
 	stp	x29, x30, [sp, #64]
 	add	x29, sp, #64
@ -555,7 +563,7 @@ compress_pre:
 zfs_blake3_compress_xof_sse2:
 	.cfi_startproc
 	hint	#25
-	.cfi_negate_ra_state
+	CFI_NEGATE_RA_STATE
 	sub	sp, sp, #96
 	stp	x29, x30, [sp, #64]
 	add	x29, sp, #64
@ -608,7 +616,7 @@ zfs_blake3_compress_xof_sse2:
 zfs_blake3_hash_many_sse2:
 	.cfi_startproc
 	hint	#25
-	.cfi_negate_ra_state
+	CFI_NEGATE_RA_STATE
 	stp	d15, d14, [sp, #-160]!
 	stp	d13, d12, [sp, #16]
 	stp	d11, d10, [sp, #32]
--- a/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
+++ b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
@ -32,6 +32,14 @@
 */

 #if defined(__aarch64__)
+
+/* make gcc <= 9 happy */
+#if LD_VERSION >= 233010000
+#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state
+#else
+#define CFI_NEGATE_RA_STATE
+#endif
+
 	.text
 	.section	.note.gnu.property,"a",@note
 	.p2align	3
@ -51,7 +59,7 @@
 zfs_blake3_compress_in_place_sse41:
 	.cfi_startproc
 	hint	#25
-	.cfi_negate_ra_state
+	CFI_NEGATE_RA_STATE
 	sub	sp, sp, #96
 	stp	x29, x30, [sp, #64]
 	add	x29, sp, #64
@ -565,7 +573,7 @@ compress_pre:
 zfs_blake3_compress_xof_sse41:
 	.cfi_startproc
 	hint	#25
-	.cfi_negate_ra_state
+	CFI_NEGATE_RA_STATE
 	sub	sp, sp, #96
 	stp	x29, x30, [sp, #64]
 	add	x29, sp, #64
--- a/sys/contrib/openzfs/module/icp/asm-aarch64/sha2/sha256-armv8.S
+++ b/sys/contrib/openzfs/module/icp/asm-aarch64/sha2/sha256-armv8.S
@ -21,6 +21,16 @@

 #if defined(__aarch64__)

+	.section	.note.gnu.property,"a",@note
+	.p2align	3
+	.word	4
+	.word	16
+	.word	5
+	.asciz	"GNU"
+	.word	3221225472
+	.word	4
+	.word	3
+	.word	0
 .text

 .align	6
--- a/sys/contrib/openzfs/module/icp/asm-aarch64/sha2/sha512-armv8.S
+++ b/sys/contrib/openzfs/module/icp/asm-aarch64/sha2/sha512-armv8.S
@ -21,6 +21,16 @@

 #if defined(__aarch64__)

+	.section	.note.gnu.property,"a",@note
+	.p2align	3
+	.word	4
+	.word	16
+	.word	5
+	.asciz	"GNU"
+	.word	3221225472
+	.word	4
+	.word	3
+	.word	0
 .text

 .align	6
--- a/sys/contrib/openzfs/module/nvpair/nvpair.c
+++ b/sys/contrib/openzfs/module/nvpair/nvpair.c
@ -41,6 +41,7 @@
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/string.h>
+#include <rpc/types.h>
 #include <rpc/xdr.h>
 #include <sys/mod.h>

--- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c
@ -247,7 +247,7 @@ vdev_file_io_start(zio_t *zio)
 	vdev_t *vd = zio->io_vd;
 	vdev_file_t *vf = vd->vdev_tsd;

-	if (zio->io_type == ZIO_TYPE_IOCTL) {
+	if (zio->io_type == ZIO_TYPE_FLUSH) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
@ -255,14 +255,7 @@ vdev_file_io_start(zio_t *zio)
 			return;
 		}

-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-			zio->io_error = zfs_file_fsync(vf->vf_file,
-			    O_SYNC|O_DSYNC);
-			break;
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
-		}
+		zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC|O_DSYNC);

 		zio_execute(zio);
 		return;
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
@ -1053,7 +1053,7 @@ vdev_geom_io_intr(struct bio *bp)
 	/*
 	 * We have to split bio freeing into two parts, because the ABD code
 	 * cannot be called in this context and vdev_op_io_done is not called
-	 * for ZIO_TYPE_IOCTL zio-s.
+	 * for ZIO_TYPE_FLUSH zio-s.
 	 */
 	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
 		g_destroy_bio(bp);
@ -1153,46 +1153,35 @@ vdev_geom_io_start(zio_t *zio)

 	vd = zio->io_vd;

-	switch (zio->io_type) {
-	case ZIO_TYPE_IOCTL:
+	if (zio->io_type == ZIO_TYPE_FLUSH) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return;
-		} else {
-			switch (zio->io_cmd) {
-			case DKIOCFLUSHWRITECACHE:
-				if (zfs_nocacheflush ||
-				    vdev_geom_bio_flush_disable)
-					break;
-				if (vd->vdev_nowritecache) {
-					zio->io_error = SET_ERROR(ENOTSUP);
-					break;
-				}
-				goto sendreq;
-			default:
-				zio->io_error = SET_ERROR(ENOTSUP);
-			}
 		}

-		zio_execute(zio);
-		return;
-	case ZIO_TYPE_TRIM:
-		if (!vdev_geom_bio_delete_disable) {
-			goto sendreq;
+		if (zfs_nocacheflush || vdev_geom_bio_flush_disable) {
+			zio_execute(zio);
+			return;
+		}
+
+		if (vd->vdev_nowritecache) {
+			zio->io_error = SET_ERROR(ENOTSUP);
+			zio_execute(zio);
+			return;
+		}
+	} else if (zio->io_type == ZIO_TYPE_TRIM) {
+		if (vdev_geom_bio_delete_disable) {
+			zio_execute(zio);
+			return;
 		}
-		zio_execute(zio);
-		return;
-	default:
-			;
-		/* PASSTHROUGH --- placate compiler */
 	}
-sendreq:
+
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM ||
-	    zio->io_type == ZIO_TYPE_IOCTL);
+	    zio->io_type == ZIO_TYPE_FLUSH);

 	cp = vd->vdev_tsd;
 	if (cp == NULL) {
@ -1244,7 +1233,7 @@ vdev_geom_io_start(zio_t *zio)
 		bp->bio_offset = zio->io_offset;
 		bp->bio_length = zio->io_size;
 		break;
-	case ZIO_TYPE_IOCTL:
+	case ZIO_TYPE_FLUSH:
 		bp->bio_cmd = BIO_FLUSH;
 		bp->bio_data = NULL;
 		bp->bio_offset = cp->provider->mediasize;
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c
@ -25,6 +25,7 @@
 #include <sys/debug.h>
 #include <sys/types.h>
 #include <sys/sysmacros.h>
+#include <rpc/types.h>
 #include <rpc/xdr.h>

 /*
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@ -45,15 +45,25 @@
 /*
 * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
 * block_device. Since it carries the block_device inside, its convenient to
- * just use the handle as a proxy. For pre-6.8, we just emulate this with
- * a cast, since we don't need any of the other fields inside the handle.
+ * just use the handle as a proxy.
+ *
+ * Linux 6.9.x uses a file for the same purpose.
+ *
+ * For pre-6.8, we just emulate this with a cast, since we don't need any of
+ * the other fields inside the handle.
 */
-#ifdef HAVE_BDEV_OPEN_BY_PATH
+#if defined(HAVE_BDEV_OPEN_BY_PATH)
 typedef struct bdev_handle zfs_bdev_handle_t;
 #define	BDH_BDEV(bdh)		((bdh)->bdev)
 #define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
 #define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
 #define	BDH_ERR_PTR(err)	(ERR_PTR(err))
+#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
+typedef struct file zfs_bdev_handle_t;
+#define	BDH_BDEV(bdh)		(file_bdev(bdh))
+#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
+#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
+#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
 #else
 typedef void zfs_bdev_handle_t;
 #define	BDH_BDEV(bdh)		((struct block_device *)bdh)
@ -242,7 +252,9 @@ vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
 {
 	vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);

-#if defined(HAVE_BDEV_OPEN_BY_PATH)
+#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
+	return (bdev_file_open_by_path(path, bmode, holder, NULL));
+#elif defined(HAVE_BDEV_OPEN_BY_PATH)
 	return (bdev_open_by_path(path, bmode, holder, NULL));
 #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
 	return (blkdev_get_by_path(path, bmode, holder, NULL));
@ -258,8 +270,10 @@ vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
 	return (bdev_release(bdh));
 #elif defined(HAVE_BLKDEV_PUT_HOLDER)
 	return (blkdev_put(BDH_BDEV(bdh), holder));
-#else
+#elif defined(HAVE_BLKDEV_PUT)
 	return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode)));
+#else
+	fput(bdh);
 #endif
 }

@ -755,8 +769,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
 static void
 vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 {
-	ASSERT(vbio->vbio_bdev);
-
 	/*
 	 * We plug so we can submit the BIOs as we go and only unplug them when
 	 * they are fully created and submitted. This is important; if we don't
@ -774,12 +786,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 	vbio->vbio_bio->bi_end_io = vbio_completion;
 	vbio->vbio_bio->bi_private = vbio;

+	/*
+	 * Once submitted, vbio_bio now owns vbio (through bi_private) and we
+	 * can't touch it again. The bio may complete and vbio_completion() be
+	 * called and free the vbio before this task is run again, so we must
+	 * consider it invalid from this point.
+	 */
 	vdev_submit_bio(vbio->vbio_bio);

 	blk_finish_plug(&plug);
-
-	vbio->vbio_bio = NULL;
-	vbio->vbio_bdev = NULL;
 }

 /* IO completion callback */
@ -838,6 +853,11 @@ BIO_END_IO_PROTO(vbio_completion, bio, error)
 * pages) but we still have to ensure the data portion is correctly sized and
 * aligned to the logical block size, to ensure that if the kernel wants to
 * split the BIO, the two halves will still be properly aligned.
+ *
+ * NOTE: if you change this function, change the copy in
+ * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
+ * data there to validate the change you're making.
+ *
 */
 typedef struct {
 	uint_t  bmask;
@ -848,6 +868,7 @@ typedef struct {
 static int
 vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
 {
+	(void) page;
 	vdev_disk_check_pages_t *s = priv;

 	/*
@ -861,7 +882,7 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
 	 * Note if we're taking less than a full block, so we can check it
 	 * above on the next call.
 	 */
-	s->end = len & s->bmask;
+	s->end = (off+len) & s->bmask;

 	/* All blocks after the first must start on a block size boundary. */
 	if (s->npages != 0 && (off & s->bmask) != 0)
@ -1237,8 +1258,6 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 	return (0);
 }

-#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
-	defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
 BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
 {
 	zio_t *zio = bio->bi_private;
@ -1253,54 +1272,99 @@ BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
 	zio_interrupt(zio);
 }

+/*
+ * Wrappers for the different secure erase and discard APIs. We use async
+ * when available; in this case, *biop is set to the last bio in the chain.
+ */
 static int
-vdev_issue_discard_trim(zio_t *zio, unsigned long flags)
+vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector,
+    sector_t nsect, struct bio **biop)
 {
-	int ret;
-	struct bio *bio = NULL;
+	*biop = NULL;
+	int error;

-#if defined(BLKDEV_DISCARD_SECURE)
-	ret = - __blkdev_issue_discard(
-	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
-	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio);
+#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
+	error = blkdev_issue_secure_erase(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
+	error = __blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
+	error = blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE);
 #else
-	(void) flags;
-	ret = - __blkdev_issue_discard(
-	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
-	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio);
+#error "unsupported kernel"
 #endif
-	if (!ret && bio) {
+
+	return (error);
+}
+
+static int
+vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector,
+    sector_t nsect, struct bio **biop)
+{
+	*biop = NULL;
+	int error;
+
+#if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
+	error = __blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS, 0, biop);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS)
+	error = __blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS, biop);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
+	error = blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS, 0);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS)
+	error = blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS);
+#else
+#error "unsupported kernel"
+#endif
+
+	return (error);
+}
+
+/*
+ * Entry point for TRIM ops. This calls the right wrapper for secure erase or
+ * discard, and then does the appropriate finishing work for error vs success
+ * and async vs sync.
+ */
+static int
+vdev_disk_io_trim(zio_t *zio)
+{
+	int error;
+	struct bio *bio;
+
+	zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh;
+	sector_t sector = zio->io_offset >> 9;
+	sector_t nsects = zio->io_size >> 9;
+
+	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
+		error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio);
+	else
+		error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio);
+
+	if (error != 0)
+		return (SET_ERROR(-error));
+
+	if (bio == NULL) {
+		/*
+		 * This was a synchronous op that completed successfully, so
+		 * return it to ZFS immediately.
+		 */
+		zio_interrupt(zio);
+	} else {
+		/*
+		 * This was an asynchronous op; set up completion callback and
+		 * issue it.
+		 */
 		bio->bi_private = zio;
 		bio->bi_end_io = vdev_disk_discard_end_io;
 		vdev_submit_bio(bio);
 	}
-	return (ret);
-}
-#endif

-static int
-vdev_disk_io_trim(zio_t *zio)
-{
-	unsigned long trim_flags = 0;
-	if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
-#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
-		return (-blkdev_issue_secure_erase(
-		    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
-		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
-#elif defined(BLKDEV_DISCARD_SECURE)
-		trim_flags |= BLKDEV_DISCARD_SECURE;
-#endif
-	}
-#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
-	defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
-	return (vdev_issue_discard_trim(zio, trim_flags));
-#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
-	return (-blkdev_issue_discard(
-	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
-	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
-#else
-#error "Unsupported kernel"
-#endif
+	return (0);
 }

 int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
@ -1336,53 +1400,42 @@ vdev_disk_io_start(zio_t *zio)
 	}

 	switch (zio->io_type) {
-	case ZIO_TYPE_IOCTL:
+	case ZIO_TYPE_FLUSH:

 		if (!vdev_readable(v)) {
-			rw_exit(&vd->vd_lock);
-			zio->io_error = SET_ERROR(ENXIO);
-			zio_interrupt(zio);
-			return;
-		}
-
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-
-			if (zfs_nocacheflush)
-				break;
-
-			if (v->vdev_nowritecache) {
-				zio->io_error = SET_ERROR(ENOTSUP);
-				break;
-			}
-
+			/* Drive not there, can't flush */
+			error = SET_ERROR(ENXIO);
+		} else if (zfs_nocacheflush) {
+			/* Flushing disabled by operator, declare success */
+			error = 0;
+		} else if (v->vdev_nowritecache) {
+			/* This vdev not capable of flushing */
+			error = SET_ERROR(ENOTSUP);
+		} else {
+			/*
+			 * Issue the flush. If successful, the response will
+			 * be handled in the completion callback, so we're done.
+			 */
 			error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
 			if (error == 0) {
 				rw_exit(&vd->vd_lock);
 				return;
 			}
-
-			zio->io_error = error;
-
-			break;
-
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
 		}

+		/* Couldn't issue the flush, so set the error and return it */
 		rw_exit(&vd->vd_lock);
+		zio->io_error = error;
 		zio_execute(zio);
 		return;

 	case ZIO_TYPE_TRIM:
-		zio->io_error = vdev_disk_io_trim(zio);
+		error = vdev_disk_io_trim(zio);
 		rw_exit(&vd->vd_lock);
-#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
-		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
-			zio_interrupt(zio);
-#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
-		zio_interrupt(zio);
-#endif
+		if (error) {
+			zio->io_error = error;
+			zio_execute(zio);
+		}
 		return;

 	case ZIO_TYPE_READ:
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
@ -242,7 +242,7 @@ vdev_file_io_start(zio_t *zio)
 	vdev_t *vd = zio->io_vd;
 	vdev_file_t *vf = vd->vdev_tsd;

-	if (zio->io_type == ZIO_TYPE_IOCTL) {
+	if (zio->io_type == ZIO_TYPE_FLUSH) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
@ -250,33 +250,27 @@ vdev_file_io_start(zio_t *zio)
 			return;
 		}

-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-
-			if (zfs_nocacheflush)
-				break;
-
-			/*
-			 * We cannot safely call vfs_fsync() when PF_FSTRANS
-			 * is set in the current context.  Filesystems like
-			 * XFS include sanity checks to verify it is not
-			 * already set, see xfs_vm_writepage().  Therefore
-			 * the sync must be dispatched to a different context.
-			 */
-			if (__spl_pf_fstrans_check()) {
-				VERIFY3U(taskq_dispatch(vdev_file_taskq,
-				    vdev_file_io_fsync, zio, TQ_SLEEP), !=,
-				    TASKQID_INVALID);
-				return;
-			}
-
-			zio->io_error = zfs_file_fsync(vf->vf_file,
-			    O_SYNC | O_DSYNC);
-			break;
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
+		if (zfs_nocacheflush) {
+			zio_execute(zio);
+			return;
 		}

+		/*
+		 * We cannot safely call vfs_fsync() when PF_FSTRANS
+		 * is set in the current context.  Filesystems like
+		 * XFS include sanity checks to verify it is not
+		 * already set, see xfs_vm_writepage().  Therefore
+		 * the sync must be dispatched to a different context.
+		 */
+		if (__spl_pf_fstrans_check()) {
+			VERIFY3U(taskq_dispatch(vdev_file_taskq,
+			    vdev_file_io_fsync, zio, TQ_SLEEP), !=,
+			    TASKQID_INVALID);
+			return;
+		}
+
+		zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
+
 		zio_execute(zio);
 		return;
 	} else if (zio->io_type == ZIO_TYPE_TRIM) {
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@ -37,6 +37,7 @@
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
 #include <sys/zvol_impl.h>
+#include <cityhash.h>

 #include <linux/blkdev_compat.h>
 #include <linux/task_io_accounting_ops.h>
@ -53,6 +54,12 @@ static unsigned int zvol_request_sync = 0;
 static unsigned int zvol_prefetch_bytes = (128 * 1024);
 static unsigned long zvol_max_discard_blocks = 16384;

+/*
+ * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
+ * to utilize more threads for small files but may affect prefetch hits.
+ */
+#define	ZVOL_TASKQ_OFFSET_SHIFT 29
+
 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 static unsigned int zvol_open_timeout_ms = 1000;
 #endif
@ -76,6 +83,8 @@ static boolean_t zvol_use_blk_mq = B_FALSE;
 static unsigned int zvol_blk_mq_blocks_per_thread = 8;
 #endif

+static unsigned int zvol_num_taskqs = 0;
+
 #ifndef	BLKDEV_DEFAULT_RQ
 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
 #define	BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
@ -114,7 +123,11 @@ struct zvol_state_os {
 	boolean_t use_blk_mq;
 };

-static taskq_t *zvol_taskq;
+typedef struct zv_taskq {
+	uint_t tqs_cnt;
+	taskq_t **tqs_taskq;
+} zv_taskq_t;
+static zv_taskq_t zvol_taskqs;
 static struct ida zvol_ida;

 typedef struct zv_request_stack {
@ -532,6 +545,22 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 	}

 	zv_request_task_t *task;
+	zv_taskq_t *ztqs = &zvol_taskqs;
+	uint_t blk_mq_hw_queue = 0;
+	uint_t tq_idx;
+	uint_t taskq_hash;
+#ifdef HAVE_BLK_MQ
+	if (rq)
+#ifdef HAVE_BLK_MQ_RQ_HCTX
+		blk_mq_hw_queue = rq->mq_hctx->queue_num;
+#else
+		blk_mq_hw_queue =
+		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
+#endif
+#endif
+	taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
+	    blk_mq_hw_queue, 0);
+	tq_idx = taskq_hash % ztqs->tqs_cnt;

 	if (rw == WRITE) {
 		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
@ -601,7 +630,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 				zvol_discard(&zvr);
 			} else {
 				task = zv_request_task_create(zvr);
-				taskq_dispatch_ent(zvol_taskq,
+				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 				    zvol_discard_task, task, 0, &task->ent);
 			}
 		} else {
@ -609,7 +638,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 				zvol_write(&zvr);
 			} else {
 				task = zv_request_task_create(zvr);
-				taskq_dispatch_ent(zvol_taskq,
+				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 				    zvol_write_task, task, 0, &task->ent);
 			}
 		}
@ -631,7 +660,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 			zvol_read(&zvr);
 		} else {
 			task = zv_request_task_create(zvr);
-			taskq_dispatch_ent(zvol_taskq,
+			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 			    zvol_read_task, task, 0, &task->ent);
 		}
 	}
@ -1053,6 +1082,16 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
 	if (zso->zvo_disk == NULL)
 		return (1);

+	zso->zvo_disk->minors = ZVOL_MINORS;
+	zso->zvo_queue = zso->zvo_disk->queue;
+#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
+	struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
+	if (IS_ERR(disk)) {
+		zso->zvo_disk = NULL;
+		return (1);
+	}
+
+	zso->zvo_disk = disk;
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
 #else
@ -1103,6 +1142,17 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
 	}
 	zso->zvo_queue = zso->zvo_disk->queue;
 	zso->zvo_disk->minors = ZVOL_MINORS;
+#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
+	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv);
+	if (IS_ERR(disk)) {
+		zso->zvo_disk = NULL;
+		blk_mq_free_tag_set(&zso->tag_set);
+		return (1);
+	}
+
+	zso->zvo_disk = disk;
+	zso->zvo_queue = zso->zvo_disk->queue;
+	zso->zvo_disk->minors = ZVOL_MINORS;
 #else
 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
 	if (zso->zvo_disk == NULL) {
@ -1256,7 +1306,7 @@ zvol_os_free(zvol_state_t *zv)

 	del_gendisk(zv->zv_zso->zvo_disk);
 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
-	defined(HAVE_BLK_ALLOC_DISK)
+	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
 #if defined(HAVE_BLK_CLEANUP_DISK)
 	blk_cleanup_disk(zv->zv_zso->zvo_disk);
 #else
@ -1577,8 +1627,40 @@ zvol_init(void)
 		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
 	}

+	/*
+	 * Use atleast 32 zvol_threads but for many core system,
+	 * prefer 6 threads per taskq, but no more taskqs
+	 * than threads in them on large systems.
+	 *
+	 *                 taskq   total
+	 * cpus    taskqs  threads threads
+	 * ------- ------- ------- -------
+	 * 1       1       32       32
+	 * 2       1       32       32
+	 * 4       1       32       32
+	 * 8       2       16       32
+	 * 16      3       11       33
+	 * 32      5       7        35
+	 * 64      8       8        64
+	 * 128     11      12       132
+	 * 256     16      16       256
+	 */
+	zv_taskq_t *ztqs = &zvol_taskqs;
+	uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
+	if (num_tqs == 0) {
+		num_tqs = 1 + num_online_cpus() / 6;
+		while (num_tqs * num_tqs > zvol_actual_threads)
+			num_tqs--;
+	}
+	uint_t per_tq_thread = zvol_actual_threads / num_tqs;
+	if (per_tq_thread * num_tqs < zvol_actual_threads)
+		per_tq_thread++;
+	ztqs->tqs_cnt = num_tqs;
+	ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
 	error = register_blkdev(zvol_major, ZVOL_DRIVER);
 	if (error) {
+		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
+		ztqs->tqs_taskq = NULL;
 		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
 		return (error);
 	}
@ -1598,11 +1680,22 @@ zvol_init(void)
 		    1024);
 	}
 #endif
-	zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri,
-	    zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
-	if (zvol_taskq == NULL) {
-		unregister_blkdev(zvol_major, ZVOL_DRIVER);
-		return (-ENOMEM);
+	for (uint_t i = 0; i < num_tqs; i++) {
+		char name[32];
+		(void) snprintf(name, sizeof (name), "%s_tq-%u",
+		    ZVOL_DRIVER, i);
+		ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
+		    maxclsyspri, per_tq_thread, INT_MAX,
+		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+		if (ztqs->tqs_taskq[i] == NULL) {
+			for (int j = i - 1; j >= 0; j--)
+				taskq_destroy(ztqs->tqs_taskq[j]);
+			unregister_blkdev(zvol_major, ZVOL_DRIVER);
+			kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
+			    sizeof (taskq_t *));
+			ztqs->tqs_taskq = NULL;
+			return (-ENOMEM);
+		}
 	}

 	zvol_init_impl();
@ -1613,9 +1706,22 @@ zvol_init(void)
 void
 zvol_fini(void)
 {
+	zv_taskq_t *ztqs = &zvol_taskqs;
 	zvol_fini_impl();
 	unregister_blkdev(zvol_major, ZVOL_DRIVER);
-	taskq_destroy(zvol_taskq);
+
+	if (ztqs->tqs_taskq == NULL) {
+		ASSERT3U(ztqs->tqs_cnt, ==, 0);
+	} else {
+		for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
+			ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
+			taskq_destroy(ztqs->tqs_taskq[i]);
+		}
+		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
+		    sizeof (taskq_t *));
+		ztqs->tqs_taskq = NULL;
+	}
+
 	ida_destroy(&zvol_ida);
 }

@ -1636,6 +1742,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
 module_param(zvol_max_discard_blocks, ulong, 0444);
 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

+module_param(zvol_num_taskqs, uint, 0444);
+MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
+
 module_param(zvol_prefetch_bytes, uint, 0644);
 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@ -1960,7 +1960,7 @@ arc_buf_untransform_in_place(arc_buf_t *buf)
 	ASSERT(HDR_ENCRYPTED(hdr));
 	ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
-	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+	ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf);

 	zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
 	    arc_buf_size(buf));
@ -2083,7 +2083,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 		 * allocate a new data buffer for the buf.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
-			ASSERT(ARC_BUF_COMPRESSED(buf));
+			ASSERTF(ARC_BUF_COMPRESSED(buf),
+			"buf %p was uncompressed", buf);

 			/* We need to give the buf its own b_data */
 			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
@ -3872,7 +3873,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,

 	ASSERT3P(marker, !=, NULL);

-	mls = multilist_sublist_lock(ml, idx);
+	mls = multilist_sublist_lock_idx(ml, idx);

 	for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
 	    hdr = multilist_sublist_prev(mls, marker)) {
@ -3984,6 +3985,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 	return (bytes_evicted);
 }

+static arc_buf_hdr_t *
+arc_state_alloc_marker(void)
+{
+	arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+	/*
+	 * A b_spa of 0 is used to indicate that this header is
+	 * a marker. This fact is used in arc_evict_state_impl().
+	 */
+	marker->b_spa = 0;
+
+	return (marker);
+}
+
+static void
+arc_state_free_marker(arc_buf_hdr_t *marker)
+{
+	kmem_cache_free(hdr_full_cache, marker);
+}
+
 /*
 * Allocate an array of buffer headers used as placeholders during arc state
 * eviction.
@ -3994,16 +4015,8 @@ arc_state_alloc_markers(int count)
 	arc_buf_hdr_t **markers;

 	markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
-	for (int i = 0; i < count; i++) {
-		markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
-
-		/*
-		 * A b_spa of 0 is used to indicate that this header is
-		 * a marker. This fact is used in arc_evict_state_impl().
-		 */
-		markers[i]->b_spa = 0;
-
-	}
+	for (int i = 0; i < count; i++)
+		markers[i] = arc_state_alloc_marker();
 	return (markers);
 }

@ -4011,7 +4024,7 @@ static void
 arc_state_free_markers(arc_buf_hdr_t **markers, int count)
 {
 	for (int i = 0; i < count; i++)
-		kmem_cache_free(hdr_full_cache, markers[i]);
+		arc_state_free_marker(markers[i]);
 	kmem_free(markers, sizeof (*markers) * count);
 }

@ -4055,7 +4068,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls;

-		mls = multilist_sublist_lock(ml, i);
+		mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_insert_tail(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
@ -4120,7 +4133,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 	}

 	for (int i = 0; i < num_sublists; i++) {
-		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_remove(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
@ -8633,7 +8646,7 @@ l2arc_sublist_lock(int list_num)
 	 * sublists being selected.
 	 */
 	idx = multilist_get_random_index(ml);
-	return (multilist_sublist_lock(ml, idx));
+	return (multilist_sublist_lock_idx(ml, idx));
 }

 /*
@ -9046,9 +9059,9 @@ l2arc_blk_fetch_done(zio_t *zio)
 static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
-	arc_buf_hdr_t 		*hdr, *hdr_prev, *head;
-	uint64_t 		write_asize, write_psize, write_lsize, headroom;
-	boolean_t		full;
+	arc_buf_hdr_t 		*hdr, *head, *marker;
+	uint64_t 		write_asize, write_psize, headroom;
+	boolean_t		full, from_head = !arc_warm;
 	l2arc_write_callback_t	*cb = NULL;
 	zio_t 			*pio, *wzio;
 	uint64_t 		guid = spa_load_guid(spa);
@ -9057,10 +9070,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	ASSERT3P(dev->l2ad_vdev, !=, NULL);

 	pio = NULL;
-	write_lsize = write_asize = write_psize = 0;
+	write_asize = write_psize = 0;
 	full = B_FALSE;
 	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
 	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
+	marker = arc_state_alloc_marker();

 	/*
 	 * Copy buffers for L2ARC writing.
@ -9075,40 +9089,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 				continue;
 		}

-		multilist_sublist_t *mls = l2arc_sublist_lock(pass);
 		uint64_t passed_sz = 0;
-
-		VERIFY3P(mls, !=, NULL);
-
-		/*
-		 * L2ARC fast warmup.
-		 *
-		 * Until the ARC is warm and starts to evict, read from the
-		 * head of the ARC lists rather than the tail.
-		 */
-		if (arc_warm == B_FALSE)
-			hdr = multilist_sublist_head(mls);
-		else
-			hdr = multilist_sublist_tail(mls);
-
 		headroom = target_sz * l2arc_headroom;
 		if (zfs_compressed_arc_enabled)
 			headroom = (headroom * l2arc_headroom_boost) / 100;

-		for (; hdr; hdr = hdr_prev) {
+		/*
+		 * Until the ARC is warm and starts to evict, read from the
+		 * head of the ARC lists rather than the tail.
+		 */
+		multilist_sublist_t *mls = l2arc_sublist_lock(pass);
+		ASSERT3P(mls, !=, NULL);
+		if (from_head)
+			hdr = multilist_sublist_head(mls);
+		else
+			hdr = multilist_sublist_tail(mls);
+
+		while (hdr != NULL) {
 			kmutex_t *hash_lock;
 			abd_t *to_write = NULL;

-			if (arc_warm == B_FALSE)
-				hdr_prev = multilist_sublist_next(mls, hdr);
-			else
-				hdr_prev = multilist_sublist_prev(mls, hdr);
-
 			hash_lock = HDR_LOCK(hdr);
 			if (!mutex_tryenter(hash_lock)) {
-				/*
-				 * Skip this buffer rather than waiting.
-				 */
+skip:
+				/* Skip this buffer rather than waiting. */
+				if (from_head)
+					hdr = multilist_sublist_next(mls, hdr);
+				else
+					hdr = multilist_sublist_prev(mls, hdr);
 				continue;
 			}

@ -9123,11 +9131,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)

 			if (!l2arc_write_eligible(guid, hdr)) {
 				mutex_exit(hash_lock);
-				continue;
+				goto skip;
 			}

 			ASSERT(HDR_HAS_L1HDR(hdr));
-
 			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
 			ASSERT3U(arc_hdr_size(hdr), >, 0);
 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
@ -9149,12 +9156,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			}

 			/*
-			 * We rely on the L1 portion of the header below, so
-			 * it's invalid for this header to have been evicted out
-			 * of the ghost cache, prior to being written out. The
-			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+			 * We should not sleep with sublist lock held or it
+			 * may block ARC eviction.  Insert a marker to save
+			 * the position and drop the lock.
 			 */
-			arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
+			if (from_head) {
+				multilist_sublist_insert_after(mls, hdr,
+				    marker);
+			} else {
+				multilist_sublist_insert_before(mls, hdr,
+				    marker);
+			}
+			multilist_sublist_unlock(mls);

 			/*
 			 * If this header has b_rabd, we can use this since it
@ -9185,32 +9198,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 				    &to_write);
 				if (ret != 0) {
 					arc_hdr_clear_flags(hdr,
-					    ARC_FLAG_L2_WRITING);
+					    ARC_FLAG_L2CACHE);
 					mutex_exit(hash_lock);
-					continue;
+					goto next;
 				}

 				l2arc_free_abd_on_write(to_write, asize, type);
 			}

+			hdr->b_l2hdr.b_dev = dev;
+			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+			hdr->b_l2hdr.b_hits = 0;
+			hdr->b_l2hdr.b_arcs_state =
+			    hdr->b_l1hdr.b_state->arcs_state;
+			mutex_enter(&dev->l2ad_mtx);
 			if (pio == NULL) {
 				/*
 				 * Insert a dummy header on the buflist so
 				 * l2arc_write_done() can find where the
 				 * write buffers begin without searching.
 				 */
-				mutex_enter(&dev->l2ad_mtx);
 				list_insert_head(&dev->l2ad_buflist, head);
-				mutex_exit(&dev->l2ad_mtx);
+			}
+			list_insert_head(&dev->l2ad_buflist, hdr);
+			mutex_exit(&dev->l2ad_mtx);
+			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
+			    ARC_FLAG_L2_WRITING);

+			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
+			    arc_hdr_size(hdr), hdr);
+			l2arc_hdr_arcstats_increment(hdr);
+
+			boolean_t commit = l2arc_log_blk_insert(dev, hdr);
+			mutex_exit(hash_lock);
+
+			if (pio == NULL) {
 				cb = kmem_alloc(
 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
 				cb->l2wcb_dev = dev;
 				cb->l2wcb_head = head;
-				/*
-				 * Create a list to save allocated abd buffers
-				 * for l2arc_log_blk_commit().
-				 */
 				list_create(&cb->l2wcb_abd_list,
 				    sizeof (l2arc_lb_abd_buf_t),
 				    offsetof(l2arc_lb_abd_buf_t, node));
@ -9218,54 +9244,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 				    ZIO_FLAG_CANFAIL);
 			}

-			hdr->b_l2hdr.b_dev = dev;
-			hdr->b_l2hdr.b_hits = 0;
-
-			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
-			hdr->b_l2hdr.b_arcs_state =
-			    hdr->b_l1hdr.b_state->arcs_state;
-			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
-
-			mutex_enter(&dev->l2ad_mtx);
-			list_insert_head(&dev->l2ad_buflist, hdr);
-			mutex_exit(&dev->l2ad_mtx);
-
-			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
-			    arc_hdr_size(hdr), hdr);
-
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
-			    hdr->b_l2hdr.b_daddr, asize, to_write,
+			    dev->l2ad_hand, asize, to_write,
 			    ZIO_CHECKSUM_OFF, NULL, hdr,
 			    ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_CANFAIL, B_FALSE);

-			write_lsize += HDR_GET_LSIZE(hdr);
 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
 			    zio_t *, wzio);
+			zio_nowait(wzio);

 			write_psize += psize;
 			write_asize += asize;
 			dev->l2ad_hand += asize;
-			l2arc_hdr_arcstats_increment(hdr);
 			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);

-			mutex_exit(hash_lock);
-
-			/*
-			 * Append buf info to current log and commit if full.
-			 * arcstat_l2_{size,asize} kstats are updated
-			 * internally.
-			 */
-			if (l2arc_log_blk_insert(dev, hdr)) {
-				/*
-				 * l2ad_hand will be adjusted in
-				 * l2arc_log_blk_commit().
-				 */
+			if (commit) {
+				/* l2ad_hand will be adjusted inside. */
 				write_asize +=
 				    l2arc_log_blk_commit(dev, pio, cb);
 			}

-			zio_nowait(wzio);
+next:
+			multilist_sublist_lock(mls);
+			if (from_head)
+				hdr = multilist_sublist_next(mls, marker);
+			else
+				hdr = multilist_sublist_prev(mls, marker);
+			multilist_sublist_remove(mls, marker);
 		}

 		multilist_sublist_unlock(mls);
@ -9274,9 +9280,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			break;
 	}

+	arc_state_free_marker(marker);
+
 	/* No buffers selected for writing? */
 	if (pio == NULL) {
-		ASSERT0(write_lsize);
+		ASSERT0(write_psize);
 		ASSERT(!HDR_HAS_L1HDR(head));
 		kmem_cache_free(hdr_l2only_cache, head);

@ -10604,7 +10612,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
 	L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
 	L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
 	L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
-	L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
+	L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);

 	dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
 	    HDR_GET_PSIZE(hdr));
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@ -769,7 +769,7 @@ static void
 dbuf_evict_one(void)
 {
 	int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
-	multilist_sublist_t *mls = multilist_sublist_lock(
+	multilist_sublist_t *mls = multilist_sublist_lock_idx(
 	    &dbuf_caches[DB_DBUF_CACHE].cache, idx);

 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
@ -1557,17 +1557,14 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
 * returning.
 */
 static int
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
+dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
    db_lock_type_t dblt, const void *tag)
 {
-	dnode_t *dn;
 	zbookmark_phys_t zb;
 	uint32_t aflags = ARC_FLAG_NOWAIT;
 	int err, zio_flags;
-	blkptr_t bp, *bpp;
+	blkptr_t bp, *bpp = NULL;

-	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
@ -1580,29 +1577,28 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 		goto early_unlock;
 	}

-	if (db->db_state == DB_UNCACHED) {
-		if (db->db_blkptr == NULL) {
-			bpp = NULL;
-		} else {
-			bp = *db->db_blkptr;
+	/*
+	 * If we have a pending block clone, we don't want to read the
+	 * underlying block, but the content of the block being cloned,
+	 * pointed by the dirty record, so we have the most recent data.
+	 * If there is no dirty record, then we hit a race in a sync
+	 * process when the dirty record is already removed, while the
+	 * dbuf is not yet destroyed. Such case is equivalent to uncached.
+	 */
+	if (db->db_state == DB_NOFILL) {
+		dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+		if (dr != NULL) {
+			if (!dr->dt.dl.dr_brtwrite) {
+				err = EIO;
+				goto early_unlock;
+			}
+			bp = dr->dt.dl.dr_overridden_by;
 			bpp = &bp;
 		}
-	} else {
-		dbuf_dirty_record_t *dr;
+	}

-		ASSERT3S(db->db_state, ==, DB_NOFILL);
-
-		/*
-		 * Block cloning: If we have a pending block clone,
-		 * we don't want to read the underlying block, but the content
-		 * of the block being cloned, so we have the most recent data.
-		 */
-		dr = list_head(&db->db_dirty_records);
-		if (dr == NULL || !dr->dt.dl.dr_brtwrite) {
-			err = EIO;
-			goto early_unlock;
-		}
-		bp = dr->dt.dl.dr_overridden_by;
+	if (bpp == NULL && db->db_blkptr != NULL) {
+		bp = *db->db_blkptr;
 		bpp = &bp;
 	}

@ -1643,8 +1639,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 	if (err != 0)
 		goto early_unlock;

-	DB_DNODE_EXIT(db);
-
 	db->db_state = DB_READ;
 	DTRACE_SET_STATE(db, "read issued");
 	mutex_exit(&db->db_mtx);
@ -1669,12 +1663,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 	 * parent's rwlock, which would be a lock ordering violation.
 	 */
 	dmu_buf_unlock_parent(db, dblt, tag);
-	(void) arc_read(zio, db->db_objset->os_spa, bpp,
+	return (arc_read(zio, db->db_objset->os_spa, bpp,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
-	    &aflags, &zb);
-	return (err);
+	    &aflags, &zb));
+
 early_unlock:
-	DB_DNODE_EXIT(db);
 	mutex_exit(&db->db_mtx);
 	dmu_buf_unlock_parent(db, dblt, tag);
 	return (err);
@ -1759,7 +1752,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 }

 int
-dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 {
 	int err = 0;
 	boolean_t prefetch;
@ -1775,7 +1768,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	dn = DB_DNODE(db);

 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
-	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
+	    (flags & DB_RF_NOPREFETCH) == 0;

 	mutex_enter(&db->db_mtx);
 	if (flags & DB_RF_PARTIAL_FIRST)
@ -1822,13 +1815,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)

 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);

-		if (zio == NULL && (db->db_state == DB_NOFILL ||
+		if (pio == NULL && (db->db_state == DB_NOFILL ||
 		    (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
 			spa_t *spa = dn->dn_objset->os_spa;
-			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+			pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 			need_wait = B_TRUE;
 		}
-		err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
+		err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
 		/*
 		 * dbuf_read_impl has dropped db_mtx and our parent's rwlock
 		 * for us
@ -1849,9 +1842,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		 */
 		if (need_wait) {
 			if (err == 0)
-				err = zio_wait(zio);
+				err = zio_wait(pio);
 			else
-				VERIFY0(zio_wait(zio));
+				(void) zio_wait(pio);
+			pio = NULL;
 		}
 	} else {
 		/*
@ -1878,7 +1872,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 				ASSERT(db->db_state == DB_READ ||
 				    (flags & DB_RF_HAVESTRUCT) == 0);
 				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
-				    db, zio_t *, zio);
+				    db, zio_t *, pio);
 				cv_wait(&db->db_changed, &db->db_mtx);
 			}
 			if (db->db_state == DB_UNCACHED)
@ -1887,6 +1881,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		}
 	}

+	if (pio && err != 0) {
+		zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
+		    ZIO_FLAG_CANFAIL);
+		zio->io_error = err;
+		zio_nowait(zio);
+	}
+
 	return (err);
 }

@ -2631,26 +2632,24 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));

 	/*
-	 * Quick check for dirtiness.  For already dirty blocks, this
-	 * reduces runtime of this function by >90%, and overall performance
-	 * by 50% for some workloads (e.g. file deletion with indirect blocks
-	 * cached).
+	 * Quick check for dirtiness to improve performance for some workloads
+	 * (e.g. file deletion with indirect blocks cached).
 	 */
 	mutex_enter(&db->db_mtx);
-
 	if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
-		dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 		/*
-		 * It's possible that it is already dirty but not cached,
+		 * It's possible that the dbuf is already dirty but not cached,
 		 * because there are some calls to dbuf_dirty() that don't
 		 * go through dmu_buf_will_dirty().
 		 */
+		dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 		if (dr != NULL) {
-			if (dr->dt.dl.dr_brtwrite) {
+			if (db->db_level == 0 &&
+			    dr->dt.dl.dr_brtwrite) {
 				/*
 				 * Block cloning: If we are dirtying a cloned
-				 * block, we cannot simply redirty it, because
-				 * this dr has no data associated with it.
+				 * level 0 block, we cannot simply redirty it,
+				 * because this dr has no associated data.
 				 * We will go through a full undirtying below,
 				 * before dirtying it again.
 				 */
@ -4597,11 +4596,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
 		dbuf_prepare_encrypted_dnode_leaf(dr);

-	if (db->db_state != DB_NOFILL &&
+	if (*datap != NULL && *datap == db->db_buf &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    zfs_refcount_count(&db->db_holds) > 1 &&
-	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
-	    *datap == db->db_buf) {
+	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN) {
 		/*
 		 * If this buffer is currently "in use" (i.e., there
 		 * are active holds and db_data still references it),
@ -4890,11 +4888,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 	if (db->db_level == 0) {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
-		if (db->db_state != DB_NOFILL) {
-			if (dr->dt.dl.dr_data != NULL &&
-			    dr->dt.dl.dr_data != db->db_buf) {
-				arc_buf_destroy(dr->dt.dl.dr_data, db);
-			}
+		if (dr->dt.dl.dr_data != NULL &&
+		    dr->dt.dl.dr_data != db->db_buf) {
+			arc_buf_destroy(dr->dt.dl.dr_data, db);
 		}
 	} else {
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@ -5097,21 +5093,18 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)

 	os = dn->dn_objset;

-	if (db->db_state != DB_NOFILL) {
-		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
-			/*
-			 * Private object buffers are released here rather
-			 * than in dbuf_dirty() since they are only modified
-			 * in the syncing context and we don't want the
-			 * overhead of making multiple copies of the data.
-			 */
-			if (BP_IS_HOLE(db->db_blkptr)) {
-				arc_buf_thaw(data);
-			} else {
-				dbuf_release_bp(db);
-			}
-			dbuf_remap(dn, db, tx);
-		}
+	if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+		/*
+		 * Private object buffers are released here rather than in
+		 * dbuf_dirty() since they are only modified in the syncing
+		 * context and we don't want the overhead of making multiple
+		 * copies of the data.
+		 */
+		if (BP_IS_HOLE(db->db_blkptr))
+			arc_buf_thaw(data);
+		else
+			dbuf_release_bp(db);
+		dbuf_remap(dn, db, tx);
 	}

 	if (parent != dn->dn_dbuf) {
@ -5147,7 +5140,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)

 	if (db->db_blkid == DMU_SPILL_BLKID)
 		wp_flag = WP_SPILL;
-	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+	wp_flag |= (data == NULL) ? WP_NOFILL : 0;

 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);

@ -5179,7 +5172,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
 		    dr->dt.dl.dr_brtwrite);
 		mutex_exit(&db->db_mtx);
-	} else if (db->db_state == DB_NOFILL) {
+	} else if (data == NULL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(pio, os->os_spa, txg,
--- a/sys/contrib/openzfs/module/zfs/dmu.c
+++ b/sys/contrib/openzfs/module/zfs/dmu.c
@ -569,8 +569,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
-			if (zs)
-				dmu_zfetch_run(zs, missed, B_TRUE);
+			if (zs) {
+				dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
+				    B_TRUE);
+			}
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			if (read)
@ -606,7 +608,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 		zfs_racct_write(length, nblks);

 	if (zs)
-		dmu_zfetch_run(zs, missed, B_TRUE);
+		dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
 	rw_exit(&dn->dn_struct_rwlock);

 	if (read) {
--- a/sys/contrib/openzfs/module/zfs/dmu_objset.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c
@ -1665,7 +1665,7 @@ sync_dnodes_task(void *arg)
 	objset_t *os = soa->soa_os;

 	multilist_sublist_t *ms =
-	    multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
+	    multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);

 	dmu_objset_sync_dnodes(ms, soa->soa_tx);

@ -2076,8 +2076,8 @@ userquota_updates_task(void *arg)
 	dnode_t *dn;
 	userquota_cache_t cache = { { 0 } };

-	multilist_sublist_t *list =
-	    multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+	multilist_sublist_t *list = multilist_sublist_lock_idx(
+	    &os->os_synced_dnodes, uua->uua_sublist_idx);

 	ASSERT(multilist_sublist_head(list) == NULL ||
 	    dmu_objset_userused_enabled(os));
@ -2159,8 +2159,8 @@ dnode_rele_task(void *arg)
 	userquota_updates_arg_t *uua = arg;
 	objset_t *os = uua->uua_os;

-	multilist_sublist_t *list =
-	    multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+	multilist_sublist_t *list = multilist_sublist_lock_idx(
+	    &os->os_synced_dnodes, uua->uua_sublist_idx);

 	dnode_t *dn;
 	while ((dn = multilist_sublist_head(list)) != NULL) {
--- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
@ -65,9 +65,16 @@ unsigned int	zfetch_max_distance = 64 * 1024 * 1024;
 #endif
 /* max bytes to prefetch indirects for per stream (default 64MB) */
 unsigned int	zfetch_max_idistance = 64 * 1024 * 1024;
+/* max request reorder distance within a stream (default 16MB) */
+unsigned int	zfetch_max_reorder = 16 * 1024 * 1024;
+/* Max log2 fraction of holes in a stream */
+unsigned int	zfetch_hole_shift = 2;

 typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_hits;
+	kstat_named_t zfetchstat_future;
+	kstat_named_t zfetchstat_stride;
+	kstat_named_t zfetchstat_past;
 	kstat_named_t zfetchstat_misses;
 	kstat_named_t zfetchstat_max_streams;
 	kstat_named_t zfetchstat_io_issued;
@ -76,6 +83,9 @@ typedef struct zfetch_stats {

 static zfetch_stats_t zfetch_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
+	{ "future",			KSTAT_DATA_UINT64 },
+	{ "stride",			KSTAT_DATA_UINT64 },
+	{ "past",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
 	{ "max_streams",		KSTAT_DATA_UINT64 },
 	{ "io_issued",			KSTAT_DATA_UINT64 },
@ -84,6 +94,9 @@ static zfetch_stats_t zfetch_stats = {

 struct {
 	wmsum_t zfetchstat_hits;
+	wmsum_t zfetchstat_future;
+	wmsum_t zfetchstat_stride;
+	wmsum_t zfetchstat_past;
 	wmsum_t zfetchstat_misses;
 	wmsum_t zfetchstat_max_streams;
 	wmsum_t zfetchstat_io_issued;
@ -107,6 +120,12 @@ zfetch_kstats_update(kstat_t *ksp, int rw)
 		return (EACCES);
 	zs->zfetchstat_hits.value.ui64 =
 	    wmsum_value(&zfetch_sums.zfetchstat_hits);
+	zs->zfetchstat_future.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_future);
+	zs->zfetchstat_stride.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_stride);
+	zs->zfetchstat_past.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_past);
 	zs->zfetchstat_misses.value.ui64 =
 	    wmsum_value(&zfetch_sums.zfetchstat_misses);
 	zs->zfetchstat_max_streams.value.ui64 =
@ -122,6 +141,9 @@ void
 zfetch_init(void)
 {
 	wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_future, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_stride, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_past, 0);
 	wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
 	wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
 	wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
@ -147,6 +169,9 @@ zfetch_fini(void)
 	}

 	wmsum_fini(&zfetch_sums.zfetchstat_hits);
+	wmsum_fini(&zfetch_sums.zfetchstat_future);
+	wmsum_fini(&zfetch_sums.zfetchstat_stride);
+	wmsum_fini(&zfetch_sums.zfetchstat_past);
 	wmsum_fini(&zfetch_sums.zfetchstat_misses);
 	wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
 	wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
@ -222,22 +247,22 @@ static void
 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 {
 	zstream_t *zs, *zs_next, *zs_old = NULL;
-	hrtime_t now = gethrtime(), t;
+	uint_t now = gethrestime_sec(), t;

 	ASSERT(MUTEX_HELD(&zf->zf_lock));

 	/*
 	 * Delete too old streams, reusing the first found one.
 	 */
-	t = now - SEC2NSEC(zfetch_max_sec_reap);
+	t = now - zfetch_max_sec_reap;
 	for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
 		zs_next = list_next(&zf->zf_stream, zs);
 		/*
 		 * Skip if still active.  1 -- zf_stream reference.
 		 */
-		if (zfs_refcount_count(&zs->zs_refs) != 1)
+		if ((int)(zs->zs_atime - t) >= 0)
 			continue;
-		if (zs->zs_atime > t)
+		if (zfs_refcount_count(&zs->zs_refs) != 1)
 			continue;
 		if (zs_old)
 			dmu_zfetch_stream_remove(zf, zs);
@ -246,6 +271,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	}
 	if (zs_old) {
 		zs = zs_old;
+		list_remove(&zf->zf_stream, zs);
 		goto reuse;
 	}

@ -255,21 +281,23 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	 * for all the streams to be non-overlapping.
 	 */
 	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
-	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+	    (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
 	    zfetch_max_distance));
 	if (zf->zf_numstreams >= max_streams) {
-		t = now - SEC2NSEC(zfetch_min_sec_reap);
+		t = now - zfetch_min_sec_reap;
 		for (zs = list_head(&zf->zf_stream); zs != NULL;
 		    zs = list_next(&zf->zf_stream, zs)) {
+			if ((int)(zs->zs_atime - t) >= 0)
+				continue;
 			if (zfs_refcount_count(&zs->zs_refs) != 1)
 				continue;
-			if (zs->zs_atime > t)
-				continue;
-			if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime)
+			if (zs_old == NULL ||
+			    (int)(zs_old->zs_atime - zs->zs_atime) >= 0)
 				zs_old = zs;
 		}
 		if (zs_old) {
 			zs = zs_old;
+			list_remove(&zf->zf_stream, zs);
 			goto reuse;
 		}
 		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
@ -277,24 +305,24 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	}

 	zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
-	zs->zs_fetch = zf;
 	zfs_refcount_create(&zs->zs_callers);
 	zfs_refcount_create(&zs->zs_refs);
 	/* One reference for zf_stream. */
 	zfs_refcount_add(&zs->zs_refs, NULL);
 	zf->zf_numstreams++;
-	list_insert_head(&zf->zf_stream, zs);

 reuse:
+	list_insert_head(&zf->zf_stream, zs);
 	zs->zs_blkid = blkid;
+	/* Allow immediate stream reuse until first hit. */
+	zs->zs_atime = now - zfetch_min_sec_reap;
+	memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
 	zs->zs_pf_dist = 0;
+	zs->zs_ipf_dist = 0;
 	zs->zs_pf_start = blkid;
 	zs->zs_pf_end = blkid;
-	zs->zs_ipf_dist = 0;
 	zs->zs_ipf_start = blkid;
 	zs->zs_ipf_end = blkid;
-	/* Allow immediate stream reuse until first hit. */
-	zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap);
 	zs->zs_missed = B_FALSE;
 	zs->zs_more = B_FALSE;
 }
@ -311,6 +339,120 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
 	aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
 }

+/*
+ * Process stream hit access for nblks blocks starting at zs_blkid.  Return
+ * number of blocks to proceed for after aggregation with future ranges.
+ */
+static uint64_t
+dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
+{
+	uint_t i, j;
+
+	/* Optimize sequential accesses (no future ranges). */
+	if (zs->zs_ranges[0].start == 0)
+		goto done;
+
+	/* Look for intersections with further ranges. */
+	for (i = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0 || r->start > nblks)
+			break;
+		if (r->end >= nblks) {
+			nblks = r->end;
+			i++;
+			break;
+		}
+	}
+
+	/* Delete all found intersecting ranges, updates remaining. */
+	for (j = 0; i < ZFETCH_RANGES; i++, j++) {
+		if (zs->zs_ranges[i].start == 0)
+			break;
+		ASSERT3U(zs->zs_ranges[i].start, >, nblks);
+		ASSERT3U(zs->zs_ranges[i].end, >, nblks);
+		zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
+		zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
+	}
+	if (j < ZFETCH_RANGES) {
+		zs->zs_ranges[j].start = 0;
+		zs->zs_ranges[j].end = 0;
+	}
+
+done:
+	zs->zs_blkid += nblks;
+	return (nblks);
+}
+
+/*
+ * Process future stream access for nblks blocks starting at blkid.  Return
+ * number of blocks to proceed for if future ranges reach fill threshold.
+ */
+static uint64_t
+dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
+{
+	ASSERT3U(blkid, >, zs->zs_blkid);
+	blkid -= zs->zs_blkid;
+	ASSERT3U(blkid + nblks, <=, UINT16_MAX);
+
+	/* Search for first and last intersection or insert point. */
+	uint_t f = ZFETCH_RANGES, l = 0, i;
+	for (i = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0 || r->start > blkid + nblks)
+			break;
+		if (r->end < blkid)
+			continue;
+		if (f > i)
+			f = i;
+		if (l < i)
+			l = i;
+	}
+	if (f <= l) {
+		/* Got some intersecting range, expand it if needed. */
+		if (zs->zs_ranges[f].start > blkid)
+			zs->zs_ranges[f].start = blkid;
+		zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
+		if (f < l) {
+			/* Got more than one intersection, remove others. */
+			for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
+				zs->zs_ranges[f].start = zs->zs_ranges[l].start;
+				zs->zs_ranges[f].end = zs->zs_ranges[l].end;
+			}
+			zs->zs_ranges[f].start = 0;
+			zs->zs_ranges[f].end = 0;
+		}
+	} else if (i < ZFETCH_RANGES) {
+		/* Got no intersecting ranges, insert new one. */
+		for (l = ZFETCH_RANGES - 1; l > i; l--) {
+			zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
+			zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
+		}
+		zs->zs_ranges[i].start = blkid;
+		zs->zs_ranges[i].end = blkid + nblks;
+	} else {
+		/* No space left to insert.  Drop the range. */
+		return (0);
+	}
+
+	/* Check if with the new access addition we reached fill threshold. */
+	if (zfetch_hole_shift >= 16)
+		return (0);
+	uint_t hole = 0;
+	for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0)
+			break;
+		hole += r->start - f;
+		f = r->end;
+		if (hole <= r->end >> zfetch_hole_shift)
+			l = r->end;
+	}
+	if (l > 0)
+		return (dmu_zfetch_hit(zs, l));
+
+	return (0);
+}
+
 /*
 * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
 * associates dnode access specified with blkid and nblks arguments with
@ -370,53 +512,92 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	mutex_enter(&zf->zf_lock);

 	/*
-	 * Find matching prefetch stream.  Depending on whether the accesses
+	 * Find perfect prefetch stream.  Depending on whether the accesses
 	 * are block-aligned, first block of the new access may either follow
 	 * the last block of the previous access, or be equal to it.
 	 */
+	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
+	uint64_t end_blkid = blkid + nblks;
 	for (zs = list_head(&zf->zf_stream); zs != NULL;
 	    zs = list_next(&zf->zf_stream, zs)) {
 		if (blkid == zs->zs_blkid) {
-			break;
+			goto hit;
 		} else if (blkid + 1 == zs->zs_blkid) {
 			blkid++;
 			nblks--;
-			break;
+			goto hit;
 		}
 	}

 	/*
-	 * If the file is ending, remove the matching stream if found.
-	 * If not found then it is too late to create a new one now.
+	 * Find close enough prefetch stream.  Access crossing stream position
+	 * is a hit in its new part.  Access ahead of stream position considered
+	 * a hit for metadata prefetch, since we do not care about fill percent,
+	 * or stored for future otherwise.  Access behind stream position is
+	 * silently ignored, since we already skipped it reaching fill percent.
 	 */
-	uint64_t end_of_access_blkid = blkid + nblks;
-	if (end_of_access_blkid >= maxblkid) {
-		if (zs != NULL)
-			dmu_zfetch_stream_remove(zf, zs);
-		mutex_exit(&zf->zf_lock);
-		if (!have_lock)
-			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return (NULL);
+	uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
+	uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
+	for (zs = list_head(&zf->zf_stream); zs != NULL;
+	    zs = list_next(&zf->zf_stream, zs)) {
+		if (blkid > zs->zs_blkid) {
+			if (end_blkid <= zs->zs_blkid + max_reorder) {
+				if (!fetch_data) {
+					nblks = dmu_zfetch_hit(zs,
+					    end_blkid - zs->zs_blkid);
+					ZFETCHSTAT_BUMP(zfetchstat_stride);
+					goto future;
+				}
+				nblks = dmu_zfetch_future(zs, blkid, nblks);
+				if (nblks > 0)
+					ZFETCHSTAT_BUMP(zfetchstat_stride);
+				else
+					ZFETCHSTAT_BUMP(zfetchstat_future);
+				goto future;
+			}
+		} else if (end_blkid >= zs->zs_blkid) {
+			nblks -= zs->zs_blkid - blkid;
+			blkid += zs->zs_blkid - blkid;
+			goto hit;
+		} else if (end_blkid + max_reorder > zs->zs_blkid &&
+		    (int)(zs->zs_atime - t) >= 0) {
+			ZFETCHSTAT_BUMP(zfetchstat_past);
+			zs->zs_atime = gethrestime_sec();
+			goto out;
+		}
 	}

-	/* Exit if we already prefetched this block before. */
-	if (nblks == 0) {
-		mutex_exit(&zf->zf_lock);
-		if (!have_lock)
-			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return (NULL);
-	}
+	/*
+	 * This access is not part of any existing stream.  Create a new
+	 * stream for it unless we are at the end of file.
+	 */
+	if (end_blkid < maxblkid)
+		dmu_zfetch_stream_create(zf, end_blkid);
+	mutex_exit(&zf->zf_lock);
+	if (!have_lock)
+		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+	ZFETCHSTAT_BUMP(zfetchstat_misses);
+	return (NULL);

-	if (zs == NULL) {
-		/*
-		 * This access is not part of any existing stream.  Create
-		 * a new stream for it.
-		 */
-		dmu_zfetch_stream_create(zf, end_of_access_blkid);
+hit:
+	nblks = dmu_zfetch_hit(zs, nblks);
+	ZFETCHSTAT_BUMP(zfetchstat_hits);
+
+future:
+	zs->zs_atime = gethrestime_sec();
+
+	/* Exit if we already prefetched for this position before. */
+	if (nblks == 0)
+		goto out;
+
+	/* If the file is ending, remove the stream. */
+	end_blkid = zs->zs_blkid;
+	if (end_blkid >= maxblkid) {
+		dmu_zfetch_stream_remove(zf, zs);
+out:
 		mutex_exit(&zf->zf_lock);
 		if (!have_lock)
 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		ZFETCHSTAT_BUMP(zfetchstat_misses);
 		return (NULL);
 	}

@ -432,7 +613,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	 * than ~6% of ARC held by active prefetches.  It should help with
 	 * getting out of RAM on some badly mispredicted read patterns.
 	 */
-	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
 	unsigned int nbytes = nblks << dbs;
 	unsigned int pf_nblks;
 	if (fetch_data) {
@ -452,10 +632,10 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	} else {
 		pf_nblks = 0;
 	}
-	if (zs->zs_pf_start < end_of_access_blkid)
-		zs->zs_pf_start = end_of_access_blkid;
-	if (zs->zs_pf_end < end_of_access_blkid + pf_nblks)
-		zs->zs_pf_end = end_of_access_blkid + pf_nblks;
+	if (zs->zs_pf_start < end_blkid)
+		zs->zs_pf_start = end_blkid;
+	if (zs->zs_pf_end < end_blkid + pf_nblks)
+		zs->zs_pf_end = end_blkid + pf_nblks;

 	/*
 	 * Do the same for indirects, starting where we will stop reading
@ -473,9 +653,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
 		zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;

-	zs->zs_blkid = end_of_access_blkid;
-	/* Protect the stream from reclamation. */
-	zs->zs_atime = gethrtime();
 	zfs_refcount_add(&zs->zs_refs, NULL);
 	/* Count concurrent callers. */
 	zfs_refcount_add(&zs->zs_callers, NULL);
@ -483,15 +660,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,

 	if (!have_lock)
 		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-
-	ZFETCHSTAT_BUMP(zfetchstat_hits);
 	return (zs);
 }

 void
-dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
+dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
+    boolean_t have_lock)
 {
-	zfetch_t *zf = zs->zs_fetch;
 	int64_t pf_start, pf_end, ipf_start, ipf_end;
 	int epbs, issued;

@ -567,7 +742,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,

 	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
 	if (zs)
-		dmu_zfetch_run(zs, missed, have_lock);
+		dmu_zfetch_run(zf, zs, missed, have_lock);
 }

 ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
@ -590,3 +765,9 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,

 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
 	"Max bytes to prefetch indirects for per stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
+	"Max request reorder distance within a stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
+	"Max log2 fraction of holes in a stream");
--- a/sys/contrib/openzfs/module/zfs/metaslab.c
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@ -639,7 +639,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 {
 	multilist_t *ml = &mc->mc_metaslab_txg_list;
 	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
-		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		multilist_sublist_unlock(mls);
 		while (msp != NULL) {
@ -656,7 +656,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 				i--;
 				break;
 			}
-			mls = multilist_sublist_lock(ml, i);
+			mls = multilist_sublist_lock_idx(ml, i);
 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 			multilist_sublist_unlock(mls);
 			if (txg >
@ -2232,12 +2232,12 @@ metaslab_potentially_evict(metaslab_class_t *mc)
 		unsigned int idx = multilist_get_random_index(
 		    &mc->mc_metaslab_txg_list);
 		multilist_sublist_t *mls =
-		    multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
+		    multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		multilist_sublist_unlock(mls);
 		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
 		    inuse * size) {
-			VERIFY3P(mls, ==, multilist_sublist_lock(
+			VERIFY3P(mls, ==, multilist_sublist_lock_idx(
 			    &mc->mc_metaslab_txg_list, idx));
 			ASSERT3U(idx, ==,
 			    metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
--- a/sys/contrib/openzfs/module/zfs/multilist.c
+++ b/sys/contrib/openzfs/module/zfs/multilist.c
@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml)
 	return (random_in_range(ml->ml_num_sublists));
 }

+void
+multilist_sublist_lock(multilist_sublist_t *mls)
+{
+	mutex_enter(&mls->mls_lock);
+}
+
 /* Lock and return the sublist specified at the given index */
 multilist_sublist_t *
-multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx)
 {
 	multilist_sublist_t *mls;

@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
 multilist_sublist_t *
 multilist_sublist_lock_obj(multilist_t *ml, void *obj)
 {
-	return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
+	return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj)));
 }

 void
@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
 	list_insert_tail(&mls->mls_list, obj);
 }

+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_insert_after(&mls->mls_list, prev, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_insert_before(&mls->mls_list, next, obj);
+}
+
 /*
 * Move the object one element forward in the list.
 *
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@ -180,7 +180,7 @@ static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
 	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FLUSH */
 	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
 };

--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@ -4924,11 +4924,11 @@ vdev_stat_update(zio_t *zio, uint64_t psize)

 			/*
 			 * TRIM ops and bytes are reported to user space as
-			 * ZIO_TYPE_IOCTL.  This is done to preserve the
+			 * ZIO_TYPE_FLUSH.  This is done to preserve the
 			 * vdev_stat_t structure layout for user space.
 			 */
 			if (type == ZIO_TYPE_TRIM)
-				vs_type = ZIO_TYPE_IOCTL;
+				vs_type = ZIO_TYPE_FLUSH;

 			/*
 			 * Solely for the purposes of 'zpool iostat -lqrw'
@ -6239,12 +6239,12 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 			case VDEV_PROP_OPS_TRIM:
 				/*
 				 * TRIM ops and bytes are reported to user
-				 * space as ZIO_TYPE_IOCTL.  This is done to
+				 * space as ZIO_TYPE_FLUSH.  This is done to
 				 * preserve the vdev_stat_t structure layout
 				 * for user space.
 				 */
 				vdev_prop_add_list(outnvl, propname, NULL,
-				    vd->vdev_stat.vs_ops[ZIO_TYPE_IOCTL],
+				    vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_NULL:
@ -6275,12 +6275,12 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 			case VDEV_PROP_BYTES_TRIM:
 				/*
 				 * TRIM ops and bytes are reported to user
-				 * space as ZIO_TYPE_IOCTL.  This is done to
+				 * space as ZIO_TYPE_FLUSH.  This is done to
 				 * preserve the vdev_stat_t structure layout
 				 * for user space.
 				 */
 				vdev_prop_add_list(outnvl, propname, NULL,
-				    vd->vdev_stat.vs_bytes[ZIO_TYPE_IOCTL],
+				    vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_REMOVING:
--- a/sys/contrib/openzfs/module/zfs/vdev_draid.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c
@ -2548,24 +2548,20 @@ vdev_draid_read_config_spare(vdev_t *vd)
 }

 /*
- * Handle any ioctl requested of the distributed spare.  Only flushes
- * are supported in which case all children must be flushed.
+ * Handle any flush requested of the distributed spare. All children must be
+ * flushed.
 */
 static int
-vdev_draid_spare_ioctl(zio_t *zio)
+vdev_draid_spare_flush(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	int error = 0;

-	if (zio->io_cmd == DKIOCFLUSHWRITECACHE) {
-		for (int c = 0; c < vd->vdev_children; c++) {
-			zio_nowait(zio_vdev_child_io(zio, NULL,
-			    vd->vdev_child[c], zio->io_offset, zio->io_abd,
-			    zio->io_size, zio->io_type, zio->io_priority, 0,
-			    vdev_draid_spare_child_done, zio));
-		}
-	} else {
-		error = SET_ERROR(ENOTSUP);
+	for (int c = 0; c < vd->vdev_children; c++) {
+		zio_nowait(zio_vdev_child_io(zio, NULL,
+		    vd->vdev_child[c], zio->io_offset, zio->io_abd,
+		    zio->io_size, zio->io_type, zio->io_priority, 0,
+		    vdev_draid_spare_child_done, zio));
 	}

 	return (error);
@ -2596,8 +2592,8 @@ vdev_draid_spare_io_start(zio_t *zio)
 	}

 	switch (zio->io_type) {
-	case ZIO_TYPE_IOCTL:
-		zio->io_error = vdev_draid_spare_ioctl(zio);
+	case ZIO_TYPE_FLUSH:
+		zio->io_error = vdev_draid_spare_flush(zio);
 		break;

 	case ZIO_TYPE_WRITE:
--- a/sys/contrib/openzfs/module/zfs/zfs_fm.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c
@ -1096,10 +1096,7 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
 		return (B_FALSE);

 	if (zio != NULL) {
-		/*
-		 * If this is not a read or write zio, ignore the error.  This
-		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
-		 */
+		/* If this is not a read or write zio, ignore the error */
 		if (zio->io_type != ZIO_TYPE_READ &&
 		    zio->io_type != ZIO_TYPE_WRITE)
 			return (B_FALSE);
--- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@ -40,6 +40,7 @@
 * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
 * Copyright (c) 2019, 2021, Klara Inc.
 * Copyright (c) 2019, Allan Jude
+ * Copyright 2024 Oxide Computer Company
 */

 /*
@ -5345,8 +5346,9 @@ zfs_ioc_recv(zfs_cmd_t *zc)

 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
 	    strchr(zc->zc_value, '@') == NULL ||
-	    strchr(zc->zc_value, '%'))
+	    strchr(zc->zc_value, '%') != NULL) {
 		return (SET_ERROR(EINVAL));
+	}

 	(void) strlcpy(tofs, zc->zc_value, sizeof (tofs));
 	tosnap = strchr(tofs, '@');
@ -5354,13 +5356,15 @@ zfs_ioc_recv(zfs_cmd_t *zc)

 	if (zc->zc_nvlist_src != 0 &&
 	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
-	    zc->zc_iflags, &recvdprops)) != 0)
-		return (error);
+	    zc->zc_iflags, &recvdprops)) != 0) {
+		goto out;
+	}

 	if (zc->zc_nvlist_conf != 0 &&
 	    (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
-	    zc->zc_iflags, &localprops)) != 0)
-		return (error);
+	    zc->zc_iflags, &localprops)) != 0) {
+		goto out;
+	}

 	if (zc->zc_string[0])
 		origin = zc->zc_string;
@ -5372,8 +5376,6 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 	error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops,
 	    NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record,
 	    &zc->zc_cookie, &zc->zc_obj, &errors);
-	nvlist_free(recvdprops);
-	nvlist_free(localprops);

 	/*
 	 * Now that all props, initial and delayed, are set, report the prop
@ -5389,7 +5391,10 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 		error = SET_ERROR(EINVAL);
 	}

+out:
 	nvlist_free(errors);
+	nvlist_free(recvdprops);
+	nvlist_free(localprops);

 	return (error);
 }
@ -5456,8 +5461,9 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)

 	if (dataset_namecheck(snapname, NULL, NULL) != 0 ||
 	    strchr(snapname, '@') == NULL ||
-	    strchr(snapname, '%'))
+	    strchr(snapname, '%') != NULL) {
 		return (SET_ERROR(EINVAL));
+	}

 	(void) strlcpy(tofs, snapname, sizeof (tofs));
 	tosnap = strchr(tofs, '@');
@ -5481,15 +5487,15 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 	/* we still use "props" here for backwards compatibility */
 	error = nvlist_lookup_nvlist(innvl, "props", &recvprops);
 	if (error && error != ENOENT)
-		return (error);
+		goto out;

 	error = nvlist_lookup_nvlist(innvl, "localprops", &localprops);
 	if (error && error != ENOENT)
-		return (error);
+		goto out;

 	error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
 	if (error && error != ENOENT)
-		return (error);
+		goto out;

 	error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops,
 	    hidden_args, force, heal, resumable, input_fd, begin_record,
@ -5499,9 +5505,11 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 	fnvlist_add_uint64(outnvl, "error_flags", errflags);
 	fnvlist_add_nvlist(outnvl, "errors", errors);

+out:
 	nvlist_free(errors);
 	nvlist_free(recvprops);
 	nvlist_free(localprops);
+	nvlist_free(hidden_args);

 	return (error);
 }
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@ -125,10 +125,9 @@ static kstat_t *zil_kstats_global;
 int zil_replay_disable = 0;

 /*
- * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
- * the disk(s) by the ZIL after an LWB write has completed. Setting this
- * will cause ZIL corruption on power loss if a volatile out-of-order
- * write cache is enabled.
+ * Disable the flush commands that are normally sent to the disk(s) by the ZIL
+ * after an LWB write has completed. Setting this will cause ZIL corruption on
+ * power loss if a volatile out-of-order write cache is enabled.
 */
 static int zil_nocacheflush = 0;

@ -1406,19 +1405,17 @@ zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 }

 /*
- * This function is a called after all vdevs associated with a given lwb
- * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
- * as the lwb write completes, if "zil_nocacheflush" is set. Further,
- * all "previous" lwb's will have completed before this function is
- * called; i.e. this function is called for all previous lwbs before
- * it's called for "this" lwb (enforced via zio the dependencies
- * configured in zil_lwb_set_zio_dependency()).
+ * This function is a called after all vdevs associated with a given lwb write
+ * have completed their flush command; or as soon as the lwb write completes,
+ * if "zil_nocacheflush" is set. Further, all "previous" lwb's will have
+ * completed before this function is called; i.e. this function is called for
+ * all previous lwbs before it's called for "this" lwb (enforced via zio the
+ * dependencies configured in zil_lwb_set_zio_dependency()).
 *
- * The intention is for this function to be called as soon as the
- * contents of an lwb are considered "stable" on disk, and will survive
- * any sudden loss of power. At this point, any threads waiting for the
- * lwb to reach this state are signalled, and the "waiter" structures
- * are marked "done".
+ * The intention is for this function to be called as soon as the contents of
+ * an lwb are considered "stable" on disk, and will survive any sudden loss of
+ * power. At this point, any threads waiting for the lwb to reach this state
+ * are signalled, and the "waiter" structures are marked "done".
 */
 static void
 zil_lwb_flush_vdevs_done(zio_t *zio)
@ -1532,17 +1529,16 @@ zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg)
 }

 /*
- * This is called when an lwb's write zio completes. The callback's
- * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
- * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
- * in writing out this specific lwb's data, and in the case that cache
- * flushes have been deferred, vdevs involved in writing the data for
- * previous lwbs. The writes corresponding to all the vdevs in the
- * lwb_vdev_tree will have completed by the time this is called, due to
- * the zio dependencies configured in zil_lwb_set_zio_dependency(),
- * which takes deferred flushes into account. The lwb will be "done"
- * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
- * completion callback for the lwb's root zio.
+ * This is called when an lwb's write zio completes. The callback's purpose is
+ * to issue the flush commands for the vdevs in the lwb's lwb_vdev_tree. The
+ * tree will contain the vdevs involved in writing out this specific lwb's
+ * data, and in the case that cache flushes have been deferred, vdevs involved
+ * in writing the data for previous lwbs. The writes corresponding to all the
+ * vdevs in the lwb_vdev_tree will have completed by the time this is called,
+ * due to the zio dependencies configured in zil_lwb_set_zio_dependency(),
+ * which takes deferred flushes into account. The lwb will be "done" once
+ * zil_lwb_flush_vdevs_done() is called, which occurs in the zio completion
+ * callback for the lwb's root zio.
 */
 static void
 zil_lwb_write_done(zio_t *zio)
@ -1601,19 +1597,18 @@ zil_lwb_write_done(zio_t *zio)
 	}

 	/*
-	 * If this lwb does not have any threads waiting for it to
-	 * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
-	 * command to the vdevs written to by "this" lwb, and instead
-	 * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
-	 * command for those vdevs. Thus, we merge the vdev tree of
-	 * "this" lwb with the vdev tree of the "next" lwb in the list,
-	 * and assume the "next" lwb will handle flushing the vdevs (or
-	 * deferring the flush(s) again).
+	 * If this lwb does not have any threads waiting for it to complete, we
+	 * want to defer issuing the flush command to the vdevs written to by
+	 * "this" lwb, and instead rely on the "next" lwb to handle the flush
+	 * command for those vdevs. Thus, we merge the vdev tree of "this" lwb
+	 * with the vdev tree of the "next" lwb in the list, and assume the
+	 * "next" lwb will handle flushing the vdevs (or deferring the flush(s)
+	 * again).
 	 *
-	 * This is a useful performance optimization, especially for
-	 * workloads with lots of async write activity and few sync
-	 * write and/or fsync activity, as it has the potential to
-	 * coalesce multiple flush commands to a vdev into one.
+	 * This is a useful performance optimization, especially for workloads
+	 * with lots of async write activity and few sync write and/or fsync
+	 * activity, as it has the potential to coalesce multiple flush
+	 * commands to a vdev into one.
 	 */
 	if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
 		zil_lwb_flush_defer(lwb, nlwb);
@ -1663,16 +1658,16 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
 	 * If the previous lwb's write hasn't already completed, we also want
 	 * to order the completion of the lwb write zios (above, we only order
 	 * the completion of the lwb root zios). This is required because of
-	 * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb.
+	 * how we can defer the flush commands for each lwb.
 	 *
-	 * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous
-	 * lwb will rely on this lwb to flush the vdevs written to by that
-	 * previous lwb. Thus, we need to ensure this lwb doesn't issue the
-	 * flush until after the previous lwb's write completes. We ensure
-	 * this ordering by setting the zio parent/child relationship here.
+	 * When the flush commands are deferred, the previous lwb will rely on
+	 * this lwb to flush the vdevs written to by that previous lwb. Thus,
+	 * we need to ensure this lwb doesn't issue the flush until after the
+	 * previous lwb's write completes. We ensure this ordering by setting
+	 * the zio parent/child relationship here.
 	 *
-	 * Without this relationship on the lwb's write zio, it's possible
-	 * for this lwb's write to complete prior to the previous lwb's write
+	 * Without this relationship on the lwb's write zio, it's possible for
+	 * this lwb's write to complete prior to the previous lwb's write
 	 * completing; and thus, the vdevs for the previous lwb would be
 	 * flushed prior to that lwb's data being written to those vdevs (the
 	 * vdevs are flushed in the lwb write zio's completion handler,
@ -3499,8 +3494,8 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
 *      callback of the lwb's zio[*].
 *
 *      * Actually, the waiters are signaled in the zio completion
- *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
- *        that are sent to the vdevs upon completion of the lwb zio.
+ *        callback of the root zio for the flush commands that are sent to
+ *        the vdevs upon completion of the lwb zio.
 *
 *   2. When the itxs are inserted into the ZIL's queue of uncommitted
 *      itxs, the order in which they are inserted is preserved[*]; as
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@ -23,7 +23,7 @@
 * Copyright (c) 2011, 2022 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
- * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, 2023, 2024, Klara Inc.
 * Copyright (c) 2019, Allan Jude
 * Copyright (c) 2021, Datto, Inc.
 */
@ -63,7 +63,7 @@ const char *const zio_type_name[ZIO_TYPES] = {
 	 * Note: Linux kernel thread name length is limited
 	 * so these names will differ from upstream open zfs.
 	 */
-	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
+	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim"
 };

 int zio_dva_throttle_enabled = B_TRUE;
@ -1449,17 +1449,6 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	return (zio);
 }

-zio_t *
-zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, zio_flag_t flags)
-{
-	zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
-	    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
-	    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
-	zio->io_cmd = cmd;
-	return (zio);
-}
-
 zio_t *
 zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    zio_done_func_t *done, void *private, zio_priority_t priority,
@ -1626,15 +1615,25 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
 	return (zio);
 }

+
+/*
+ * Send a flush command to the given vdev. Unlike most zio creation functions,
+ * the flush zios are issued immediately. You can wait on pio to pause until
+ * the flushes complete.
+ */
 void
 zio_flush(zio_t *pio, vdev_t *vd)
 {
+	const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+	    ZIO_FLAG_DONT_RETRY;
+
 	if (vd->vdev_nowritecache)
 		return;
+
 	if (vd->vdev_children == 0) {
-		zio_nowait(zio_ioctl(pio, vd->vdev_spa, vd,
-		    DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL |
-		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
+		zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
+		    NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flags, vd, 0,
+		    NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE));
 	} else {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
 			zio_flush(pio, vd->vdev_child[c]);
@ -4059,6 +4058,16 @@ zio_vdev_io_start(zio_t *zio)
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM)) {

+		if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) {
+			/*
+			 * "no-op" injections return success, but do no actual
+			 * work. Just skip the remaining vdev stages.
+			 */
+			zio_vdev_io_bypass(zio);
+			zio_interrupt(zio);
+			return (NULL);
+		}
+
 		if ((zio = vdev_queue_io(zio)) == NULL)
 			return (NULL);

@ -4086,14 +4095,17 @@ zio_vdev_io_done(zio_t *zio)
 	}

 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
-	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
+	    zio->io_type == ZIO_TYPE_WRITE ||
+	    zio->io_type == ZIO_TYPE_FLUSH ||
+	    zio->io_type == ZIO_TYPE_TRIM);

 	if (zio->io_delay)
 		zio->io_delay = gethrtime() - zio->io_delay;

 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops != &vdev_draid_spare_ops) {
-		vdev_queue_io_done(zio);
+		if (zio->io_type != ZIO_TYPE_FLUSH)
+			vdev_queue_io_done(zio);

 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_device_injections(vd, zio,
@ -4237,8 +4249,7 @@ zio_vdev_io_assess(zio_t *zio)
 	 * boolean flag so that we don't bother with it in the future.
 	 */
 	if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
-	    zio->io_type == ZIO_TYPE_IOCTL &&
-	    zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
+	    zio->io_type == ZIO_TYPE_FLUSH && vd != NULL)
 		vd->vdev_nowritecache = B_TRUE;

 	if (zio->io_error)
--- a/sys/contrib/openzfs/module/zfs/zio_inject.c
+++ b/sys/contrib/openzfs/module/zfs/zio_inject.c
@ -364,10 +364,10 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
 	int ret = 0;

 	/*
-	 * We skip over faults in the labels unless it's during
-	 * device open (i.e. zio == NULL).
+	 * We skip over faults in the labels unless it's during device open
+	 * (i.e. zio == NULL) or a device flush (offset is meaningless)
 	 */
-	if (zio != NULL) {
+	if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) {
 		uint64_t offset = zio->io_offset;

 		if (offset < VDEV_LABEL_START_SIZE ||
--- a/sys/contrib/openzfs/scripts/zfs-tests.sh
+++ b/sys/contrib/openzfs/scripts/zfs-tests.sh
@ -326,7 +326,8 @@ OPTIONS:
 	-d DIR      Use world-writable DIR for files and loopback devices
 	-s SIZE     Use vdevs of SIZE (default: 4G)
 	-r RUNFILES Run tests in RUNFILES (default: ${DEFAULT_RUNFILES})
-	-t PATH     Run single test at PATH relative to test suite
+	-t PATH|NAME  Run single test at PATH relative to test suite,
+	                or search for test by NAME
 	-T TAGS     Comma separated list of tags (default: 'functional')
 	-u USER     Run single test as USER (default: root)

@ -340,6 +341,9 @@ $0 -r linux-fast
 # Run a single test
 $0 -t tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh

+# Run a single test by name
+$0 -t zfs_bookmark_cliargs
+
 # Cleanup a previous run of the test suite prior to testing, run the
 # default ($(echo "${DEFAULT_RUNFILES}" | sed 's/\.run//')) suite of tests and perform no cleanup on exit.
 $0 -x
@ -450,8 +454,15 @@ post_user = root
 post =
 outputdir = /var/tmp/test_results
 EOF
-	SINGLETESTDIR="${SINGLETEST%/*}"
+	if [ "$SINGLETEST" = "${SINGLETEST%/*}" ] ; then
+		NEWSINGLETEST=$(find "$STF_SUITE" -name "$SINGLETEST*" -print -quit)
+		if [ -z "$NEWSINGLETEST" ] ; then
+			fail "couldn't find test matching '$SINGLETEST'"
+		fi
+		SINGLETEST=$NEWSINGLETEST
+	fi

+	SINGLETESTDIR="${SINGLETEST%/*}"
 	SETUPDIR="$SINGLETESTDIR"
 	[ "${SETUPDIR#/}" = "$SETUPDIR" ] && SETUPDIR="$STF_SUITE/$SINGLETESTDIR"
 	[ -x "$SETUPDIR/setup.ksh"   ] && SETUPSCRIPT="setup"     || SETUPSCRIPT=
--- a/sys/contrib/openzfs/tests/runfiles/common.run
+++ b/sys/contrib/openzfs/tests/runfiles/common.run
@ -153,6 +153,12 @@ tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos',
    'clean_mirror_003_pos', 'clean_mirror_004_pos']
 tags = ['functional', 'clean_mirror']

+[tests/functional/cli_root/zinject]
+tests = ['zinject_args']
+pre =
+post =
+tags = ['functional', 'cli_root', 'zinject']
+
 [tests/functional/cli_root/zdb]
 tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos',
    'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos',
@ -246,7 +252,7 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
    'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg',
    'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted',
    'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
-    'zfs_mount_test_race']
+    'zfs_mount_test_race', 'zfs_mount_recursive']
 tags = ['functional', 'cli_root', 'zfs_mount']

 [tests/functional/cli_root/zfs_program]
@ -965,6 +971,12 @@ tests = [
    'userspace_send_encrypted', 'userspace_encrypted_13709']
 tags = ['functional', 'userquota']

+[tests/functional/vdev_disk:Linux]
+pre =
+post =
+tests = ['page_alignment']
+tags = ['functional', 'vdev_disk']
+
 [tests/functional/vdev_zaps]
 tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos',
    'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos',
--- a/sys/contrib/openzfs/tests/runfiles/sanity.run
+++ b/sys/contrib/openzfs/tests/runfiles/sanity.run
@ -155,7 +155,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
    'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos',
    'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg',
    'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount',
-    'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race']
+    'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
+    'zfs_mount_test_race', 'zfs_mount_recursive']
 tags = ['functional', 'cli_root', 'zfs_mount']

 [tests/functional/cli_root/zfs_program]
@ -599,6 +600,12 @@ tags = ['functional', 'truncate']
 tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool']
 tags = ['functional', 'upgrade']

+[tests/functional/vdev_disk:Linux]
+pre =
+post =
+tests = ['page_alignment']
+tags = ['functional', 'vdev_disk']
+
 [tests/functional/vdev_zaps]
 tests = ['vdev_zaps_001_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos',
    'vdev_zaps_005_pos', 'vdev_zaps_006_pos']
--- a/sys/contrib/openzfs/tests/zfs-tests/Makefile.am
+++ b/sys/contrib/openzfs/tests/zfs-tests/Makefile.am
@ -13,6 +13,9 @@ scripts_zfs_tests_functional_hkdf_PROGRAMS = %D%/tests/functional/hkdf/hkdf_test
 %C%_tests_functional_hkdf_hkdf_test_LDADD = \
 	libzpool.la

+scripts_zfs_tests_functional_vdev_diskdir = $(datadir)/$(PACKAGE)/zfs-tests/tests/functional/vdev_disk
+scripts_zfs_tests_functional_vdev_disk_PROGRAMS = %D%/tests/functional/vdev_disk/page_alignment
+
 scripts_zfs_tests_functional_cp_filesdir = $(datadir)/$(PACKAGE)/zfs-tests/tests/functional/cp_files
 scripts_zfs_tests_functional_cp_files_PROGRAMS = %D%/tests/functional/cp_files/seekflood

--- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
@ -606,6 +606,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/clean_mirror/clean_mirror_004_pos.ksh \
 	functional/clean_mirror/cleanup.ksh \
 	functional/clean_mirror/setup.ksh \
+	functional/cli_root/zinject/zinject_args.ksh \
 	functional/cli_root/zdb/zdb_002_pos.ksh \
 	functional/cli_root/zdb/zdb_003_pos.ksh \
 	functional/cli_root/zdb/zdb_004_pos.ksh \
@ -769,6 +770,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \
+	functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_remount.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \
 	functional/cli_root/zfs_mount/zfs_multi_mount.ksh \
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg
@ -31,6 +31,7 @@
 export mountcmd=mount
 export mountforce="$mountcmd -f"
 export mountall="$mountcmd -a"
+export mountrecursive="$mountcmd -R"

 export unmountcmd=unmount
 export unmountforce="$unmountcmd -f"
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh
@ -0,0 +1,146 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2024, iXsystems Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
+
+#
+# DESCRIPTION:
+# Verify zfs mount -R <filesystems/s> functionality.
+#
+# STRATEGY:
+# 1. Create nested datasets
+# 2. Unmount all datasets
+# 3. Recusrively mount root datasets, this should mount all datasets
+#    present in a pool
+# 4. Unmount all datasets
+# 5. Recusrsively mount child datasets with children. This should mount
+#    child datasets, but not the root dataset or parent datasets
+# 6. Unmount all datasets
+# 7. Mount root dataset recursively again and confirm all child
+#    datasets are mounted.
+#
+
+verify_runnable "both"
+
+function cleanup
+{
+	log_must datasetexists $TESTPOOL/$TESTFS1 && \
+		destroy_dataset $TESTPOOL/$TESTFS1 -R
+	log_must datasetexists $TESTPOOL/$TESTFS2 && \
+		destroy_dataset $TESTPOOL/$TESTFS2 -R
+	log_must datasetexists $TESTPOOL/$TESTFS3 && \
+		destroy_dataset $TESTPOOL/$TESTFS3 -R
+}
+
+function setup_all
+{
+	log_must datasetexists $TESTPOOL/$TESTFS || zfs create $TESTPOOL/$TESTFS
+	log_must zfs create $TESTPOOL/$TESTFS1
+	log_must zfs create $TESTPOOL/$TESTFS2
+	log_must zfs create $TESTPOOL/$TESTFS3
+	log_must zfs create $TESTPOOL/$TESTFS2/child1
+	log_must zfs create $TESTPOOL/$TESTFS2/child2
+	log_must zfs create $TESTPOOL/$TESTFS2/child3
+	log_must zfs create $TESTPOOL/$TESTFS2/child2/subchild
+	log_must zfs create $TESTPOOL/$TESTFS3/child
+}
+
+log_assert "Verify that 'zfs $mountrecursive' successfully, " \
+	"mounts the dataset along with all its children."
+
+log_onexit cleanup
+
+log_must setup_all
+
+log_must zfs $unmountall
+
+log_must zfs $mountrecursive $TESTPOOL
+
+log_must mounted $TESTPOOL
+log_must mounted $TESTPOOL/$TESTFS
+log_must mounted $TESTPOOL/$TESTFS1
+log_must mounted $TESTPOOL/$TESTFS2
+log_must mounted $TESTPOOL/$TESTFS3
+log_must mounted $TESTPOOL/$TESTFS2/child1
+log_must mounted $TESTPOOL/$TESTFS2/child2
+log_must mounted $TESTPOOL/$TESTFS2/child3
+log_must mounted $TESTPOOL/$TESTFS2/child2/subchild
+log_must mounted $TESTPOOL/$TESTFS3/child
+
+log_must zfs $unmountall
+
+log_mustnot mounted $TESTPOOL
+log_mustnot mounted $TESTPOOL/$TESTFS
+log_mustnot mounted $TESTPOOL/$TESTFS1
+log_mustnot mounted $TESTPOOL/$TESTFS2
+log_mustnot mounted $TESTPOOL/$TESTFS3
+log_mustnot mounted $TESTPOOL/$TESTFS2/child1
+log_mustnot mounted $TESTPOOL/$TESTFS2/child2
+log_mustnot mounted $TESTPOOL/$TESTFS2/child3
+log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild
+log_mustnot mounted $TESTPOOL/$TESTFS3/child
+
+log_must zfs $mountrecursive $TESTPOOL/$TESTFS2 $TESTPOOL/$TESTFS3
+
+log_mustnot mounted $TESTPOOL
+log_mustnot mounted $TESTPOOL/$TESTFS
+log_mustnot mounted $TESTPOOL/$TESTFS1
+log_must mounted $TESTPOOL/$TESTFS2
+log_must mounted $TESTPOOL/$TESTFS3
+log_must mounted $TESTPOOL/$TESTFS2/child1
+log_must mounted $TESTPOOL/$TESTFS2/child2
+log_must mounted $TESTPOOL/$TESTFS2/child3
+log_must mounted $TESTPOOL/$TESTFS2/child2/subchild
+log_must mounted $TESTPOOL/$TESTFS3/child
+
+log_must zfs $unmountall
+
+log_mustnot mounted $TESTPOOL
+log_mustnot mounted $TESTPOOL/$TESTFS
+log_mustnot mounted $TESTPOOL/$TESTFS1
+log_mustnot mounted $TESTPOOL/$TESTFS2
+log_mustnot mounted $TESTPOOL/$TESTFS3
+log_mustnot mounted $TESTPOOL/$TESTFS2/child1
+log_mustnot mounted $TESTPOOL/$TESTFS2/child2
+log_mustnot mounted $TESTPOOL/$TESTFS2/child3
+log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild
+log_mustnot mounted $TESTPOOL/$TESTFS3/child
+
+log_must zfs $mountrecursive $TESTPOOL/$TESTFS2/child2
+
+log_must mounted $TESTPOOL/$TESTFS2/child2
+log_must mounted $TESTPOOL/$TESTFS2/child2/subchild
+log_mustnot mounted $TESTPOOL
+log_mustnot mounted $TESTPOOL/$TESTFS
+log_mustnot mounted $TESTPOOL/$TESTFS1
+log_mustnot mounted $TESTPOOL/$TESTFS2
+log_mustnot mounted $TESTPOOL/$TESTFS3
+log_mustnot mounted $TESTPOOL/$TESTFS2/child1
+log_mustnot mounted $TESTPOOL/$TESTFS2/child3
+log_mustnot mounted $TESTPOOL/$TESTFS3/child
+
+log_pass "'zfs $mountrecursive' behaves as expected."
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
@ -0,0 +1,62 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara Inc.
+#
+
+#
+# TODO: this only checks that the set of valid device fault types. It should
+#       check all the other options, and that they work, and everything really.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+log_assert "Check zinject parameters."
+
+log_onexit cleanup
+
+DISK1=${DISKS%% *}
+
+function cleanup
+{
+	zinject -c all
+	default_cleanup_noexit
+}
+
+function test_device_fault
+{
+	typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt" "noop")
+	for e in ${errno[@]}; do
+		log_must eval \
+		    "zinject -d $DISK1 -e $e -T read -f 0.001 $TESTPOOL"
+	done
+	zinject -c all
+}
+
+default_mirror_setup_noexit $DISKS
+
+test_device_fault
+
+log_pass "zinject parameters work as expected."
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/vdev_disk/.gitignore
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/vdev_disk/.gitignore
@ -0,0 +1 @@
+page_alignment
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
@ -0,0 +1,413 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2023, 2024, Klara Inc.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <sys/param.h>
+#include <stdlib.h>
+
+/*
+ * This tests the vdev_disk page alignment check callback
+ * vdev_disk_check_pages_cb(). For now, this test includes a copy of that
+ * function from module/os/linux/zfs/vdev_disk.c. If you change it here,
+ * remember to change it there too, and add tests data here to validate the
+ * change you're making.
+ */
+
+struct page;
+
+typedef struct {
+	uint32_t  bmask;
+	uint32_t  npages;
+	uint32_t  end;
+} vdev_disk_check_pages_t;
+
+static int
+vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+	(void) page;
+	vdev_disk_check_pages_t *s = priv;
+
+	/*
+	 * If we didn't finish on a block size boundary last time, then there
+	 * would be a gap if we tried to use this ABD as-is, so abort.
+	 */
+	if (s->end != 0)
+		return (1);
+
+	/*
+	 * Note if we're taking less than a full block, so we can check it
+	 * above on the next call.
+	 */
+	s->end = (off+len) & s->bmask;
+
+	/* All blocks after the first must start on a block size boundary. */
+	if (s->npages != 0 && (off & s->bmask) != 0)
+		return (1);
+
+	s->npages++;
+	return (0);
+}
+
+typedef struct {
+	/* test name */
+	const char	*name;
+
+	/* blocks size mask */
+	uint32_t	mask;
+
+	/* amount of data to take */
+	size_t		size;
+
+	/* [start offset in page, len to end of page or size] */
+	size_t		pages[16][2];
+} page_test_t;
+
+static const page_test_t valid_tests[] = {
+	/* 512B block tests */
+	{
+		"512B blocks, 4K single page",
+		0x1ff, 0x1000, {
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"512B blocks, 1K at start of page",
+		0x1ff, 0x400, {
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"512B blocks, 1K at end of page",
+		0x1ff, 0x400, {
+			{ 0x0c00, 0x0400 },
+		},
+	}, {
+		"512B blocks, 1K within page, 512B start offset",
+		0x1ff, 0x400, {
+			{ 0x0200, 0x0e00 },
+		},
+	}, {
+		"512B blocks, 8K across 2x4K pages",
+		0x1ff, 0x2000, {
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"512B blocks, 4K across two pages, 2K start offset",
+		0x1ff, 0x1000, {
+			{ 0x0800, 0x0800 },
+			{ 0x0,    0x0800 },
+		},
+	}, {
+		"512B blocks, 16K across 5x4K pages, 512B start offset",
+		0x1ff, 0x4000, {
+			{ 0x0200, 0x0e00 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0200 },
+		},
+	}, {
+		"512B blocks, 64K data, 8x8K compound pages",
+		0x1ff, 0x10000, {
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+		},
+	}, {
+		"512B blocks, 64K data, 9x8K compound pages, 512B start offset",
+		0x1ff, 0x10000, {
+			{ 0x0200, 0x1e00 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x0200 },
+		},
+	}, {
+		"512B blocks, 64K data, 2x16K compound pages, 8x4K pages",
+		0x1ff, 0x10000, {
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"512B blocks, 64K data, mixed 4K/8K/16K pages",
+		0x1ff, 0x10000, {
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+		},
+	}, {
+		"512B blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset",
+		0x1ff, 0x10000, {
+			{ 0x0400, 0x0c00 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x8000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0400 },
+		},
+	},
+
+	/* 4K block tests */
+	{
+		"4K blocks, 4K single page",
+		0xfff, 0x1000, {
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"4K blocks, 1K at start of page",
+		0xfff, 0x400, {
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"4K blocks, 1K at end of page",
+		0xfff, 0x400, {
+			{ 0x0c00, 0x0400 },
+		},
+	}, {
+		"4K blocks, 1K within page, 512B start offset",
+		0xfff, 0x400, {
+			{ 0x0200, 0x0e00 },
+		},
+	}, {
+		"4K blocks, 8K across 2x4K pages",
+		0xfff, 0x2000, {
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"4K blocks, 4K across two pages, 2K start offset",
+		0xfff, 0x1000, {
+			{ 0x0800, 0x0800 },
+			{ 0x0,    0x0800 },
+		},
+	}, {
+		"4K blocks, 16K across 5x4K pages, 512B start offset",
+		0xfff, 0x4000, {
+			{ 0x0200, 0x0e00 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0200 },
+		},
+	}, {
+		"4K blocks, 64K data, 8x8K compound pages",
+		0xfff, 0x10000, {
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+		},
+	}, {
+		"4K blocks, 64K data, 9x8K compound pages, 512B start offset",
+		0xfff, 0x10000, {
+			{ 0x0200, 0x1e00 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x0200 },
+		},
+	}, {
+		"4K blocks, 64K data, 2x16K compound pages, 8x4K pages",
+		0xfff, 0x10000, {
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"4K blocks, 64K data, mixed 4K/8K/16K pages",
+		0xfff, 0x10000, {
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+		},
+	}, {
+		"4K blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset",
+		0xfff, 0x10000, {
+			{ 0x0400, 0x0c00 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x8000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0400 },
+		},
+	},
+
+	{ 0 },
+};
+
+static const page_test_t invalid_tests[] = {
+	{
+		"512B blocks, 16K data, 512 leader (gang block simulation)",
+		0x1ff, 0x8000, {
+			{ 0x0, 0x0200 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x0c00 },
+		},
+	}, {
+		"4K blocks, 32K data, 2 incompatible spans "
+		"(gang abd simulation)",
+		0xfff, 0x8000, {
+			{ 0x0800, 0x0800 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0800 },
+			{ 0x0800, 0x0800 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0800 },
+		},
+	},
+	{ 0 },
+};
+
+static bool
+run_test(const page_test_t *test, bool verbose)
+{
+	size_t rem = test->size;
+
+	vdev_disk_check_pages_t s = {
+	    .bmask = 0xfff,
+	    .npages = 0,
+	    .end = 0,
+	};
+
+	for (int i = 0; test->pages[i][1] > 0; i++) {
+		size_t off = test->pages[i][0];
+		size_t len = test->pages[i][1];
+
+		size_t take = MIN(rem, len);
+
+		if (verbose)
+			printf("  page %d [off %lx len %lx], "
+			    "rem %lx, take %lx\n",
+			    i, off, len, rem, take);
+
+		if (vdev_disk_check_pages_cb(NULL, off, take, &s)) {
+			if (verbose)
+				printf("  ABORT: misalignment detected, "
+				    "rem %lx\n", rem);
+			return (false);
+		}
+
+		rem -= take;
+		if (rem == 0)
+			break;
+	}
+
+	if (rem > 0) {
+		if (verbose)
+			printf("  ABORT: ran out of pages, rem %lx\n", rem);
+		return (false);
+	}
+
+	return (true);
+}
+
+static void
+run_test_set(const page_test_t *tests, bool want, int *ntests, int *npassed)
+{
+	for (const page_test_t *test = &tests[0]; test->name; test++) {
+		bool pass = (run_test(test, false) == want);
+		if (pass) {
+			printf("%s: PASS\n", test->name);
+			(*npassed)++;
+		} else {
+			printf("%s: FAIL [expected %s, got %s]\n", test->name,
+			    want ? "VALID" : "INVALID",
+			    want ? "INVALID" : "VALID");
+			run_test(test, true);
+		}
+		(*ntests)++;
+	}
+}
+
+int main(void) {
+	int ntests = 0, npassed = 0;
+
+	run_test_set(valid_tests, true, &ntests, &npassed);
+	run_test_set(invalid_tests, false, &ntests, &npassed);
+
+	printf("\n%d/%d tests passed\n", npassed, ntests);
+
+	return (ntests == npassed ? 0 : 1);
+}
--- a/sys/modules/zfs/zfs_config.h
+++ b/sys/modules/zfs/zfs_config.h
@ -93,6 +93,9 @@
 /* bdev_check_media_change() exists */
 /* #undef HAVE_BDEV_CHECK_MEDIA_CHANGE */

+/* bdev_file_open_by_path() exists */
+/* #undef HAVE_BDEV_FILE_OPEN_BY_PATH */
+
 /* bdev_*_io_acct() available */
 /* #undef HAVE_BDEV_IO_ACCT_63 */

@ -159,15 +162,24 @@
 /* blkdev_get_by_path() handles ERESTARTSYS */
 /* #undef HAVE_BLKDEV_GET_ERESTARTSYS */

-/* blkdev_issue_discard() is available */
-/* #undef HAVE_BLKDEV_ISSUE_DISCARD */
+/* __blkdev_issue_discard(flags) is available */
+/* #undef HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS */

 /* __blkdev_issue_discard() is available */
-/* #undef HAVE_BLKDEV_ISSUE_DISCARD_ASYNC */
+/* #undef HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS */
+
+/* blkdev_issue_discard(flags) is available */
+/* #undef HAVE_BLKDEV_ISSUE_DISCARD_FLAGS */
+
+/* blkdev_issue_discard() is available */
+/* #undef HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS */

 /* blkdev_issue_secure_erase() is available */
 /* #undef HAVE_BLKDEV_ISSUE_SECURE_ERASE */

+/* blkdev_put() exists */
+/* #undef HAVE_BLKDEV_PUT */
+
 /* blkdev_put() accepts void* as arg 2 */
 /* #undef HAVE_BLKDEV_PUT_HOLDER */

@ -183,6 +195,9 @@
 /* blk_alloc_disk() exists */
 /* #undef HAVE_BLK_ALLOC_DISK */

+/* blk_alloc_disk() exists and takes 2 args */
+/* #undef HAVE_BLK_ALLOC_DISK_2ARG */
+
 /* blk_alloc_queue() expects request function */
 /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */

@ -198,6 +213,9 @@
 /* block multiqueue is available */
 /* #undef HAVE_BLK_MQ */

+/* block multiqueue hardware context is cached in struct request */
+/* #undef HAVE_BLK_MQ_RQ_HCTX */
+
 /* blk queue backing_dev_info is dynamic */
 /* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */

@ -325,8 +343,8 @@
 /* sops->evict_inode() exists */
 /* #undef HAVE_EVICT_INODE */

-/* Define to 1 if you have the `execvpe' function. */
-/* #undef HAVE_EXECVPE */
+/* Define to 1 if you have the 'execvpe' function. */
+#define HAVE_EXECVPE 1

 /* FALLOC_FL_ZERO_RANGE is defined */
 /* #undef HAVE_FALLOC_FL_ZERO_RANGE */
@ -553,7 +571,7 @@
 /* yes */
 /* #undef HAVE_IO_SCHEDULE_TIMEOUT */

-/* Define to 1 if you have the `issetugid' function. */
+/* Define to 1 if you have the 'issetugid' function. */
 #define HAVE_ISSETUGID 1

 /* iter_iov() is available */
@ -661,7 +679,7 @@
 /* iops->mkdir() takes umode_t */
 /* #undef HAVE_MKDIR_UMODE_T */

-/* Define to 1 if you have the `mlockall' function. */
+/* Define to 1 if you have the 'mlockall' function. */
 #define HAVE_MLOCKALL 1

 /* page_size() is available */
@ -867,10 +885,10 @@
 /* Define to 1 if you have the <string.h> header file. */
 #define HAVE_STRING_H 1

-/* Define to 1 if you have the `strlcat' function. */
+/* Define to 1 if you have the 'strlcat' function. */
 #define HAVE_STRLCAT 1

-/* Define to 1 if you have the `strlcpy' function. */
+/* Define to 1 if you have the 'strlcpy' function. */
 #define HAVE_STRLCPY 1

 /* submit_bio is member of struct block_device_operations */
@ -918,7 +936,7 @@
 /* kernel has totalram_pages() */
 /* #undef HAVE_TOTALRAM_PAGES_FUNC */

-/* Define to 1 if you have the `udev_device_get_is_initialized' function. */
+/* Define to 1 if you have the 'udev_device_get_is_initialized' function. */
 /* #undef HAVE_UDEV_DEVICE_GET_IS_INITIALIZED */

 /* kernel has __kernel_fpu_* functions */
@ -1113,7 +1131,7 @@
 /* pde_data() is PDE_DATA() */
 /* #undef SPL_PDE_DATA */

-/* Define to 1 if all of the C90 standard headers exist (not just the ones
+/* Define to 1 if all of the C89 standard headers exist (not just the ones
   required in a freestanding environment). This macro is provided for
   backward compatibility; new code need not use it. */
 #define SYSTEM_FREEBSD 1
@ -1161,7 +1179,7 @@
 /* #undef ZFS_IS_GPL_COMPATIBLE */

 /* Define the project alias string. */
-#define ZFS_META_ALIAS "zfs-2.2.99-398-FreeBSD_g39be46f43"
+#define ZFS_META_ALIAS "zfs-2.2.99-440-FreeBSD_g90ba19eb7"

 /* Define the project author. */
 #define ZFS_META_AUTHOR "OpenZFS"
@ -1191,7 +1209,7 @@
 #define ZFS_META_NAME "zfs"

 /* Define the project release. */
-#define ZFS_META_RELEASE "398-FreeBSD_g39be46f43"
+#define ZFS_META_RELEASE "440-FreeBSD_g90ba19eb7"

 /* Define the project version. */
 #define ZFS_META_VERSION "2.2.99"
--- a/sys/modules/zfs/zfs_gitrev.h
+++ b/sys/modules/zfs/zfs_gitrev.h
@ -1 +1 @@
-#define	ZFS_META_GITREV "zfs-2.2.99-398-g39be46f43"
+#define	ZFS_META_GITREV "zfs-2.2.99-440-g90ba19eb7"