Kernel/NVMe: Add initial NVMe driver support

Add basic NVMe driver support to Serenity, based on the NVMe 1.4
specification.

The driver supports multiple NVMe drives (subsystems). Within a single
NVMe drive, however, the driver supports one controller with multiple
namespaces.

Each core gets a separate NVMe queue. As the system lacks MSI support,
pin-based interrupts are used for IO.
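
A minimal sketch of what this means for request routing (mirroring
NVMeNameSpace::start_request in the diff below; a sketch, not a
definitive implementation):

    // Each namespace holds one NVMeQueue per core; a request is dispatched
    // on the queue belonging to the CPU it arrives on. IO queue qid is
    // CPU id + 1, since qid 0 is reserved for the admin queue.
    void NVMeNameSpace::start_request(AsyncBlockDeviceRequest& request)
    {
        auto& queue = m_queues.at(Processor::current_id());
        if (request.request_type() == AsyncBlockDeviceRequest::Read)
            queue.read(request, m_nsid, request.block_index(), request.block_count());
        else
            queue.write(request, m_nsid, request.block_index(), request.block_count());
    }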

Tested the NVMe support by replacing the IDE driver with the NVMe
driver :^)
Pankaj Raghav 2021-12-16 20:37:54 +05:30 committed by Andreas Kling
parent 602b35aa62
commit e99fafb683
13 changed files with 946 additions and 0 deletions

@@ -75,6 +75,7 @@ namespace MassStorage {
enum class SubclassID {
IDEController = 0x1,
SATAController = 0x6,
NVMeController = 0x8,
};
enum class SATAProgIF {
AHCI = 0x1,

@@ -97,6 +97,9 @@ set(KERNEL_SOURCES
Storage/Partition/GUIDPartitionTable.cpp
Storage/Partition/MBRPartitionTable.cpp
Storage/Partition/PartitionTable.cpp
Storage/NVMe/NVMeController.cpp
Storage/NVMe/NVMeNameSpace.cpp
Storage/NVMe/NVMeQueue.cpp
Storage/StorageDevice.cpp
Storage/RamdiskController.cpp
Storage/RamdiskDevice.cpp

@@ -210,6 +210,10 @@
#cmakedefine01 NETWORK_TASK_DEBUG
#endif
#ifndef NVME_DEBUG
#cmakedefine01 NVME_DEBUG
#endif
#ifndef OFFD_DEBUG
#cmakedefine01 OFFD_DEBUG
#endif

@@ -68,6 +68,8 @@ protected:
: Device(major, minor)
, m_block_size(block_size)
{
// 512 is the minimum sector size in most block devices
VERIFY(m_block_size >= 512);
}
private:

@@ -0,0 +1,327 @@
/*
* Copyright (c) 2021, Pankaj R <pankydev8@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "NVMeController.h"
#include "AK/Format.h"
#include <AK/RefPtr.h>
#include <AK/Types.h>
#include <Kernel/Arch/x86/IO.h>
#include <Kernel/Arch/x86/Processor.h>
#include <Kernel/Bus/PCI/API.h>
#include <Kernel/Devices/Device.h>
#include <Kernel/FileSystem/ProcFS.h>
#include <Kernel/Sections.h>
namespace Kernel {
Atomic<u8> NVMeController::controller_id {};
ErrorOr<NonnullRefPtr<NVMeController>> NVMeController::try_initialize(const Kernel::PCI::DeviceIdentifier& device_identifier)
{
auto controller = TRY(adopt_nonnull_ref_or_enomem(new NVMeController(device_identifier)));
TRY(controller->initialize());
NVMeController::controller_id++;
return controller;
}
NVMeController::NVMeController(const PCI::DeviceIdentifier& device_identifier)
: PCI::Device(device_identifier.address())
, m_pci_device_id(device_identifier)
{
}
ErrorOr<void> NVMeController::initialize()
{
// Nr of queues = one queue per core
auto nr_of_queues = Processor::count();
auto irq = m_pci_device_id.interrupt_line().value();
PCI::enable_memory_space(m_pci_device_id.address());
PCI::enable_bus_mastering(m_pci_device_id.address());
m_bar = PCI::get_BAR0(m_pci_device_id.address()) & BAR_ADDR_MASK;
static_assert(sizeof(ControllerRegister) == REG_SQ0TDBL_START);
// Map only up to the start of the doorbell registers for the controller.
// Each queue maps its own doorbell registers individually.
m_controller_regs = Memory::map_typed_writable<ControllerRegister>(PhysicalAddress(m_bar));
calculate_doorbell_stride();
TRY(create_admin_queue(irq));
VERIFY(m_admin_queue_ready == true);
VERIFY(IO_QUEUE_SIZE < MQES(m_controller_regs->cap));
dbgln_if(NVME_DEBUG, "NVMe: IO queue depth is: {}", IO_QUEUE_SIZE);
// Create an IO queue per core
for (u32 cpuid = 0; cpuid < nr_of_queues; ++cpuid) {
// qid zero is reserved for the admin queue
TRY(create_io_queue(irq, cpuid + 1));
}
TRY(identify_and_init_namespaces());
return {};
}
bool NVMeController::reset_controller()
{
volatile u32 cc, csts;
csts = m_controller_regs->csts;
if ((csts & (1 << CSTS_RDY_BIT)) != 0x1)
return false;
cc = m_controller_regs->cc;
cc = cc & ~(1 << CC_EN_BIT);
m_controller_regs->cc = cc;
IO::delay(10);
full_memory_barrier();
csts = m_controller_regs->csts;
if ((csts & (1 << CSTS_RDY_BIT)) != 0x0)
return false;
return true;
}
bool NVMeController::start_controller()
{
volatile u32 cc, csts;
csts = m_controller_regs->csts;
if ((csts & (1 << CSTS_RDY_BIT)) != 0x0)
return false;
cc = m_controller_regs->cc;
cc = cc | (1 << CC_EN_BIT);
cc = cc | (CQ_WIDTH << CC_IOCQES_BIT);
cc = cc | (SQ_WIDTH << CC_IOSQES_BIT);
m_controller_regs->cc = cc;
IO::delay(10);
full_memory_barrier();
csts = m_controller_regs->csts;
if ((csts & (1 << CSTS_RDY_BIT)) != 0x1)
return false;
return true;
}
u32 NVMeController::get_admin_q_dept()
{
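// AQA (Admin Queue Attributes, register offset 0x24) encodes 0-based sizes:
// ASQS (admin submission queue size) in bits 11:0 and ACQS (admin completion
// queue size) in bits 27:16.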
u32 aqa = m_controller_regs->aqa;
// Queue depth is 0 based
u32 q_depth = min(ACQ_SIZE(aqa), ASQ_SIZE(aqa)) + 1;
dbgln_if(NVME_DEBUG, "NVMe: Admin queue depth is {}", q_depth);
return q_depth;
}
ErrorOr<void> NVMeController::identify_and_init_namespaces()
{
RefPtr<Memory::PhysicalPage> prp_dma_buffer;
OwnPtr<Memory::Region> prp_dma_region;
auto namespace_data_struct = ByteBuffer::create_zeroed(NVMe_IDENTIFY_SIZE).release_value();
u32 active_namespace_list[NVMe_IDENTIFY_SIZE / sizeof(u32)];
{
auto buffer = TRY(MM.allocate_dma_buffer_page("Identify PRP", Memory::Region::Access::ReadWrite, prp_dma_buffer));
prp_dma_region = move(buffer);
}
// Get the active namespace
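// (CNS 0x02 returns the Active Namespace ID list: a 4 KiB buffer holding up to
// 1024 32-bit NSIDs in increasing order, with unused entries set to zero.)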
{
NVMeSubmission sub {};
u16 status = 0;
sub.op = OP_ADMIN_IDENTIFY;
sub.data_ptr.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(prp_dma_buffer->paddr().as_ptr()));
sub.cdw10 = NVMe_CNS_ID_ACTIVE_NS & 0xff;
status = submit_admin_command(sub, true);
if (status) {
dmesgln("Failed to identify active namespace command");
return EFAULT;
}
if (void* fault_at; !safe_memcpy(active_namespace_list, prp_dma_region->vaddr().as_ptr(), NVMe_IDENTIFY_SIZE, fault_at)) {
return EFAULT;
}
}
// Get the NAMESPACE attributes
{
NVMeSubmission sub {};
IdentifyNamespace id_ns {};
u16 status = 0;
for (auto nsid : active_namespace_list) {
memset(prp_dma_region->vaddr().as_ptr(), 0, NVMe_IDENTIFY_SIZE);
// Invalid NS
if (nsid == 0)
break;
sub.op = OP_ADMIN_IDENTIFY;
sub.data_ptr.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(prp_dma_buffer->paddr().as_ptr()));
sub.cdw10 = NVMe_CNS_ID_NS & 0xff;
sub.nsid = nsid;
status = submit_admin_command(sub, true);
if (status) {
dmesgln("Failed identify namespace with nsid {}", nsid);
return EFAULT;
}
static_assert(sizeof(IdentifyNamespace) == NVMe_IDENTIFY_SIZE);
if (void* fault_at; !safe_memcpy(&id_ns, prp_dma_region->vaddr().as_ptr(), NVMe_IDENTIFY_SIZE, fault_at)) {
return EFAULT;
}
auto val = get_ns_features(id_ns);
auto block_counts = val.get<0>();
auto block_size = 1 << val.get<1>();
dbgln_if(NVME_DEBUG, "NVMe: Block count is {} and Block size is {}", block_counts, block_size);
m_namespaces.append(TRY(NVMeNameSpace::try_create(m_queues, controller_id.load(), nsid, block_counts, block_size)));
m_device_count++;
dbgln_if(NVME_DEBUG, "NVMe: Initialized namespace with NSID: {}", nsid);
}
}
return {};
}
Tuple<u64, u8> NVMeController::get_ns_features(IdentifyNamespace& identify_data_struct)
{
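// FLBAS bits 3:0 select the LBA format in use; each LBAF entry carries the LBA
// data size as a power of two in bits 23:16 (e.g. 9 -> 512-byte blocks,
// 12 -> 4 KiB blocks).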
auto flbas = identify_data_struct.flbas & FLBA_SIZE_MASK;
auto namespace_size = identify_data_struct.nsze;
auto lba_format = identify_data_struct.lbaf[flbas];
auto lba_size = (lba_format & LBA_SIZE_MASK) >> 16;
return Tuple<u64, u8>(namespace_size, lba_size);
}
RefPtr<StorageDevice> NVMeController::device(u32 index) const
{
return m_namespaces.at(index);
}
size_t NVMeController::devices_count() const
{
return m_device_count;
}
bool NVMeController::reset()
{
if (!reset_controller())
return false;
if (!start_controller())
return false;
return true;
}
bool NVMeController::shutdown()
{
TODO();
return false;
}
void NVMeController::complete_current_request([[maybe_unused]] AsyncDeviceRequest::RequestResult result)
{
VERIFY_NOT_REACHED();
}
ErrorOr<void> NVMeController::create_admin_queue(u8 irq)
{
auto qdepth = get_admin_q_dept();
OwnPtr<Memory::Region> cq_dma_region;
NonnullRefPtrVector<Memory::PhysicalPage> cq_dma_pages;
OwnPtr<Memory::Region> sq_dma_region;
NonnullRefPtrVector<Memory::PhysicalPage> sq_dma_pages;
auto cq_size = round_up_to_power_of_two(CQ_SIZE(qdepth), 4096);
auto sq_size = round_up_to_power_of_two(SQ_SIZE(qdepth), 4096);
if (!reset_controller()) {
dmesgln("Failed to reset the NVMe controller");
return EFAULT;
}
{
auto buffer = TRY(MM.allocate_dma_buffer_pages(cq_size, "Admin CQ queue", Memory::Region::Access::ReadWrite, cq_dma_pages));
cq_dma_region = move(buffer);
}
// Phase bit is important to determine completion, so zero out the space
// so that we don't get any garbage phase bit value
memset(cq_dma_region->vaddr().as_ptr(), 0, cq_size);
{
auto buffer = TRY(MM.allocate_dma_buffer_pages(sq_size, "Admin SQ queue", Memory::Region::Access::ReadWrite, sq_dma_pages));
sq_dma_region = move(buffer);
}
auto doorbell_regs = Memory::map_typed_writable<DoorbellRegister>(PhysicalAddress(m_bar + REG_SQ0TDBL_START));
m_admin_queue = TRY(NVMeQueue::try_create(0, irq, qdepth, move(cq_dma_region), cq_dma_pages, move(sq_dma_region), sq_dma_pages, move(doorbell_regs)));
m_controller_regs->acq = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(cq_dma_pages.first().paddr().as_ptr()));
m_controller_regs->asq = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(sq_dma_pages.first().paddr().as_ptr()));
if (!start_controller()) {
dmesgln("Failed to restart the NVMe controller");
return EFAULT;
}
set_admin_queue_ready_flag();
m_admin_queue->enable_interrupts();
dbgln_if(NVME_DEBUG, "NVMe: Admin queue created");
return {};
}
ErrorOr<void> NVMeController::create_io_queue(u8 irq, u8 qid)
{
NVMeSubmission sub {};
OwnPtr<Memory::Region> cq_dma_region;
NonnullRefPtrVector<Memory::PhysicalPage> cq_dma_pages;
OwnPtr<Memory::Region> sq_dma_region;
NonnullRefPtrVector<Memory::PhysicalPage> sq_dma_pages;
auto cq_size = round_up_to_power_of_two(CQ_SIZE(IO_QUEUE_SIZE), 4096);
auto sq_size = round_up_to_power_of_two(SQ_SIZE(IO_QUEUE_SIZE), 4096);
static_assert(sizeof(NVMeSubmission) == (1 << SQ_WIDTH));
{
auto buffer = TRY(MM.allocate_dma_buffer_pages(cq_size, "IO CQ queue", Memory::Region::Access::ReadWrite, cq_dma_pages));
cq_dma_region = move(buffer);
}
// Phase bit is important to determine completion, so zero out the space
// so that we don't get any garbage phase bit value
memset(cq_dma_region->vaddr().as_ptr(), 0, cq_size);
{
auto buffer = TRY(MM.allocate_dma_buffer_pages(sq_size, "IO SQ queue", Memory::Region::Access::ReadWrite, sq_dma_pages));
sq_dma_region = move(buffer);
}
{
sub.op = OP_ADMIN_CREATE_COMPLETION_QUEUE;
sub.data_ptr.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(cq_dma_pages.first().paddr().as_ptr()));
// The queue size is 0 based
sub.cdw10 = AK::convert_between_host_and_little_endian(((IO_QUEUE_SIZE - 1) << 16 | qid));
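// e.g. with IO_QUEUE_SIZE == 64 and qid == 1 this is 0x003F0001: the 0-based
// queue size (63) in the upper half and the queue identifier in the lower half.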
auto flags = QUEUE_IRQ_ENABLED | QUEUE_PHY_CONTIGUOUS;
// TODO: Eventually move to MSI.
// For now we use pin-based interrupts, so leave the interrupt vector
// (the upper 16 bits of CDW11) cleared to zero.
sub.cdw11 = AK::convert_between_host_and_little_endian(flags & 0xFFFF);
submit_admin_command(sub, true);
}
{
sub.op = OP_ADMIN_CREATE_SUBMISSION_QUEUE;
sub.data_ptr.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(sq_dma_pages.first().paddr().as_ptr()));
// The queue size is 0 based
sub.cdw10 = AK::convert_between_host_and_little_endian(((IO_QUEUE_SIZE - 1) << 16 | qid));
auto flags = QUEUE_IRQ_ENABLED | QUEUE_PHY_CONTIGUOUS;
// The qid used below points to the completion queue qid
sub.cdw11 = AK::convert_between_host_and_little_endian(qid << 16 | flags);
submit_admin_command(sub, true);
}
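// Doorbells start at BAR0 + 0x1000; each queue gets a pair of 4-byte registers
// (SQ tail, then CQ head) spaced by the doorbell stride. With CAP.DSTRD == 0,
// the common case, queue 1's SQ tail doorbell sits at offset 0x1008.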
auto queue_doorbell_offset = REG_SQ0TDBL_START + ((2 * qid) * (4 << m_dbl_stride));
auto doorbell_regs = Memory::map_typed_writable<DoorbellRegister>(PhysicalAddress(m_bar + queue_doorbell_offset));
m_queues.append(TRY(NVMeQueue::try_create(qid, irq, IO_QUEUE_SIZE, move(cq_dma_region), cq_dma_pages, move(sq_dma_region), sq_dma_pages, move(doorbell_regs))));
m_queues.last().enable_interrupts();
dbgln_if(NVME_DEBUG, "NVMe: Created IO Queue with QID{}", m_queues.size());
return {};
}
}

@@ -0,0 +1,79 @@
/*
* Copyright (c) 2021, Pankaj R <pankydev8@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/NonnullRefPtr.h>
#include <AK/NonnullRefPtrVector.h>
#include <AK/OwnPtr.h>
#include <AK/RefPtr.h>
#include <AK/Tuple.h>
#include <AK/Types.h>
#include <Kernel/Bus/PCI/Device.h>
#include <Kernel/Locking/Spinlock.h>
#include <Kernel/Memory/TypedMapping.h>
#include <Kernel/Storage/NVMe/NVMeDefinitions.h>
#include <Kernel/Storage/NVMe/NVMeNameSpace.h>
#include <Kernel/Storage/NVMe/NVMeQueue.h>
#include <Kernel/Storage/StorageController.h>
namespace Kernel {
class NVMeController : public PCI::Device
, public StorageController {
public:
static ErrorOr<NonnullRefPtr<NVMeController>> try_initialize(PCI::DeviceIdentifier const&);
ErrorOr<void> initialize();
explicit NVMeController(PCI::DeviceIdentifier const&);
RefPtr<StorageDevice> device(u32 index) const override;
size_t devices_count() const override;
protected:
bool reset() override;
bool shutdown() override;
void complete_current_request(AsyncDeviceRequest::RequestResult result) override;
public:
bool reset_controller();
bool start_controller();
u32 get_admin_q_dept();
u16 submit_admin_command(struct NVMeSubmission& sub, bool sync = false)
{
// First queue is always the admin queue
if (sync) {
return m_admin_queue->submit_sync_sqe(sub);
}
m_admin_queue->submit_sqe(sub);
return 0;
}
bool is_admin_queue_ready() { return m_admin_queue_ready; };
void set_admin_queue_ready_flag() { m_admin_queue_ready = true; };
private:
ErrorOr<void> identify_and_init_namespaces();
Tuple<u64, u8> get_ns_features(IdentifyNamespace& identify_data_struct);
ErrorOr<void> create_admin_queue(u8 irq);
ErrorOr<void> create_io_queue(u8 irq, u8 qid);
void calculate_doorbell_stride()
{
m_dbl_stride = (m_controller_regs->cap >> CAP_DBL_SHIFT) & CAP_DBL_MASK;
}
private:
PCI::DeviceIdentifier m_pci_device_id;
RefPtr<NVMeQueue> m_admin_queue;
NonnullRefPtrVector<NVMeQueue> m_queues;
NonnullRefPtrVector<NVMeNameSpace> m_namespaces;
Memory::TypedMapping<ControllerRegister> m_controller_regs;
bool m_admin_queue_ready { false };
size_t m_device_count {};
u32 m_bar;
u8 m_dbl_stride;
static Atomic<u8> controller_id;
};
}

@@ -0,0 +1,148 @@
/*
* Copyright (c) 2021, Pankaj R <pankydev8@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Endian.h>
#include <AK/Types.h>
struct NVMeCompletion;
struct NVMeSubmission;
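// Memory-mapped controller register block (NVMe 1.4, "Controller Registers"):
// CAP through ACQ, padded out to 4 KiB so it ends exactly where the doorbell
// registers begin (see the static_assert in NVMeController::initialize()).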
struct ControllerRegister {
u64 cap;
u32 vs;
u32 intms;
u32 intmc;
u32 cc;
u32 rsvd1;
u32 csts;
u32 nssr;
u32 aqa;
u64 asq;
u64 acq;
u64 rsvd2[505];
};
struct IdentifyNamespace {
u64 nsze;
u64 ncap;
u8 rsdv1[10];
u8 flbas;
u8 rsvd2[100];
u32 lbaf[16];
u64 rsvd3[488];
};
// BAR
static constexpr u32 BAR_ADDR_MASK = 0xFFFFFFF0;
// DOORBELL
static constexpr u32 REG_SQ0TDBL_START = 0x1000;
static constexpr u32 REG_SQ0TDBL_END = 0x1003;
static constexpr u8 DBL_REG_SIZE = 8;
// CAP
static constexpr u8 CAP_DBL_SHIFT = 32;
static constexpr u8 CAP_DBL_MASK = 0xf;
static constexpr u16 MQES(u64 cap)
{
return (cap & 0xffff) + 1;
}
// CC Controller Configuration
static constexpr u8 CC_EN_BIT = 0x0;
static constexpr u8 CSTS_RDY_BIT = 0x0;
static constexpr u8 CC_IOSQES_BIT = 16;
static constexpr u8 CC_IOCQES_BIT = 20;
static constexpr u16 CC_AQA_MASK = (0xfff);
static constexpr u16 ACQ_SIZE(u32 x)
{
return (x >> 16) & CC_AQA_MASK;
}
static constexpr u16 ASQ_SIZE(u32 x)
{
return x & CC_AQA_MASK;
}
static constexpr u8 CQ_WIDTH = 4; // A CQ entry is 16 bytes (2^4) in size.
static constexpr u8 SQ_WIDTH = 6; // An SQ entry is 64 bytes (2^6) in size.
static constexpr u16 CQ_SIZE(u16 q_depth)
{
return q_depth << CQ_WIDTH;
}
static constexpr u16 SQ_SIZE(u16 q_depth)
{
return q_depth << SQ_WIDTH;
}
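// With the default IO_QUEUE_SIZE of 64, CQ_SIZE(64) == 1 KiB and SQ_SIZE(64) == 4 KiB,
// each of which is rounded up to a whole 4 KiB page when the queues are allocated.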
static constexpr u8 PHASE_TAG(u16 x)
{
return x & 0x1;
}
static constexpr u16 CQ_STATUS_FIELD_MASK = 0xfffe;
static constexpr u16 CQ_STATUS_FIELD(u16 x)
{
return (x & CQ_STATUS_FIELD_MASK) >> 1;
}
static constexpr u16 IO_QUEUE_SIZE = 64; // TODO: Needs to be configurable
// IDENTIFY
static constexpr u16 NVMe_IDENTIFY_SIZE = 4096;
static constexpr u8 NVMe_CNS_ID_ACTIVE_NS = 0x2;
static constexpr u8 NVMe_CNS_ID_NS = 0x0;
static constexpr u8 FLBA_SIZE_INDEX = 26;
static constexpr u8 FLBA_SIZE_MASK = 0xf;
static constexpr u8 LBA_FORMAT_SUPPORT_INDEX = 128;
static constexpr u32 LBA_SIZE_MASK = 0x00ff0000;
// OPCODES
// ADMIN COMMAND SET
enum AdminCommandOpCode {
OP_ADMIN_CREATE_COMPLETION_QUEUE = 0x5,
OP_ADMIN_CREATE_SUBMISSION_QUEUE = 0x1,
OP_ADMIN_IDENTIFY = 0x6,
};
// IO opcodes
enum IOCommandOpcode {
OP_NVME_WRITE = 0x1,
OP_NVME_READ = 0x2
};
// FLAGS
static constexpr u8 QUEUE_PHY_CONTIGUOUS = (1 << 0);
static constexpr u8 QUEUE_IRQ_ENABLED = (1 << 1);
struct NVMeCompletion {
LittleEndian<u32> cmd_spec;
LittleEndian<u32> res;
LittleEndian<u16> sq_head; /* how much of this queue may be reclaimed */
LittleEndian<u16> sq_id; /* submission queue that generated this entry */
u16 command_id; /* of the command which completed */
LittleEndian<u16> status; /* did the command fail, and if so, why? */
};
struct DataPtr {
LittleEndian<u64> prp1;
LittleEndian<u64> prp2;
};
struct NVMeSubmission {
LittleEndian<u8> op;
LittleEndian<u8> flags;
LittleEndian<u16> cmdid;
LittleEndian<u32> nsid;
LittleEndian<u64> rsvd;
LittleEndian<u64> meta_ptr;
struct DataPtr data_ptr;
LittleEndian<u32> cdw10;
LittleEndian<u32> cdw11;
LittleEndian<u32> cdw12;
LittleEndian<u32> cdw13;
LittleEndian<u32> cdw14;
LittleEndian<u32> cdw15;
};

@@ -0,0 +1,47 @@
/*
* Copyright (c) 2021, Pankaj R <pankydev8@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "NVMeNameSpace.h"
#include <AK/NonnullOwnPtr.h>
#include <Kernel/Devices/DeviceManagement.h>
#include <Kernel/Storage/NVMe/NVMeNameSpace.h>
#include <Kernel/Storage/StorageManagement.h>
namespace Kernel {
ErrorOr<NonnullRefPtr<NVMeNameSpace>> NVMeNameSpace::try_create(NonnullRefPtrVector<NVMeQueue> queues, u8 controller_id, u16 nsid, size_t storage_size, size_t lba_size)
{
auto minor_number = StorageManagement::generate_storage_minor_number();
auto major_number = StorageManagement::storage_type_major_number();
auto device_name = String::formatted("nvme{:d}n{:d}", controller_id, nsid);
auto device_name_kstring = KString::must_create(device_name.view());
auto device = TRY(DeviceManagement::try_create_device<NVMeNameSpace>(queues, storage_size, lba_size, major_number.value(), minor_number.value(), nsid, move(device_name_kstring)));
return device;
}
NVMeNameSpace::NVMeNameSpace(NonnullRefPtrVector<NVMeQueue> queues, size_t max_addresable_block, size_t lba_size, size_t major_number, size_t minor_number, u16 nsid, NonnullOwnPtr<KString> dev_name)
: StorageDevice(major_number, minor_number, lba_size, max_addresable_block, move(dev_name))
, m_nsid(nsid)
, m_queues(queues)
{
}
void NVMeNameSpace::start_request(AsyncBlockDeviceRequest& request)
{
auto index = Processor::current_id();
auto& queue = m_queues.at(index);
// TODO: For now we only support IO transfers of up to PAGE_SIZE (matching the current constraint in the block layer).
// Eventually remove this constraint by using the PRP2 field in the submission struct and remove block layer constraint for NVMe driver.
VERIFY(request.block_count() <= (PAGE_SIZE / block_size()));
if (request.request_type() == AsyncBlockDeviceRequest::Read) {
queue.read(request, m_nsid, request.block_index(), request.block_count());
} else {
queue.write(request, m_nsid, request.block_index(), request.block_count());
}
}
}

@@ -0,0 +1,36 @@
/*
* Copyright (c) 2021, Pankaj R <pankydev8@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include "AK/kmalloc.h"
#include <AK/NonnullRefPtr.h>
#include <AK/NonnullRefPtrVector.h>
#include <AK/OwnPtr.h>
#include <AK/RefCounted.h>
#include <AK/RefPtr.h>
#include <AK/Types.h>
#include <Kernel/Locking/Spinlock.h>
#include <Kernel/Storage/NVMe/NVMeDefinitions.h>
#include <Kernel/Storage/NVMe/NVMeQueue.h>
#include <Kernel/Storage/StorageDevice.h>
namespace Kernel {
class NVMeNameSpace : public StorageDevice {
public:
static ErrorOr<NonnullRefPtr<NVMeNameSpace>> try_create(NonnullRefPtrVector<NVMeQueue> queues, u8 controller_id, u16 nsid, size_t storage_size, size_t lba_size);
explicit NVMeNameSpace(NonnullRefPtrVector<NVMeQueue> queues, size_t storage_size, size_t lba_size, size_t major_number, size_t minor_number, u16 nsid, NonnullOwnPtr<KString> early_device_name);
CommandSet command_set() const override { return CommandSet::NVMe; };
void start_request(AsyncBlockDeviceRequest& request) override;
private:
u16 m_nsid;
NonnullRefPtrVector<NVMeQueue> m_queues;
};
}

@@ -0,0 +1,205 @@
/*
* Copyright (c) 2021, Pankaj R <pankydev8@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "NVMeQueue.h"
#include "Kernel/StdLib.h"
#include <Kernel/Arch/x86/IO.h>
#include <Kernel/Storage/NVMe/NVMeController.h>
#include <Kernel/Storage/NVMe/NVMeQueue.h>
#include <Kernel/WorkQueue.h>
namespace Kernel {
ErrorOr<NonnullRefPtr<NVMeQueue>> NVMeQueue::try_create(u16 qid, u8 irq, u32 q_depth, OwnPtr<Memory::Region> cq_dma_region, NonnullRefPtrVector<Memory::PhysicalPage> cq_dma_page, OwnPtr<Memory::Region> sq_dma_region, NonnullRefPtrVector<Memory::PhysicalPage> sq_dma_page, Memory::TypedMapping<DoorbellRegister> db_regs)
{
auto queue = TRY(adopt_nonnull_ref_or_enomem(new (nothrow) NVMeQueue(qid, irq, q_depth, move(cq_dma_region), cq_dma_page, move(sq_dma_region), sq_dma_page, move(db_regs))));
TRY(queue->create());
return queue;
}
NVMeQueue::NVMeQueue(u16 qid, u8 irq, u32 q_depth, OwnPtr<Memory::Region> cq_dma_region, NonnullRefPtrVector<Memory::PhysicalPage> cq_dma_page, OwnPtr<Memory::Region> sq_dma_region, NonnullRefPtrVector<Memory::PhysicalPage> sq_dma_page, Memory::TypedMapping<DoorbellRegister> db_regs)
: IRQHandler(irq)
, m_qid(qid)
, m_admin_queue(qid == 0)
, m_irq(irq)
, m_qdepth(q_depth)
, m_cq_dma_region(move(cq_dma_region))
, m_cq_dma_page(cq_dma_page)
, m_sq_dma_region(move(sq_dma_region))
, m_sq_dma_page(sq_dma_page)
, m_db_regs(move(db_regs))
, m_current_request(nullptr)
{
m_sqe_array = { reinterpret_cast<NVMeSubmission*>(m_sq_dma_region->vaddr().as_ptr()), m_qdepth };
m_cqe_array = { reinterpret_cast<NVMeCompletion*>(m_cq_dma_region->vaddr().as_ptr()), m_qdepth };
}
ErrorOr<void> NVMeQueue::create()
{
// DMA region for RW operations. For now, requests don't exceed 4096 bytes
// (the storage device layer takes care of that).
auto buffer = TRY(MM.allocate_dma_buffer_page("Admin CQ queue", Memory::Region::Access::ReadWrite, m_rw_dma_page));
m_rw_dma_region = move(buffer);
return {};
}
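// Completion detection relies on the phase tag: the controller starts a pass over
// the CQ writing entries with the phase bit set to 1, then flips it to 0 on the
// next pass after wrapping around. An entry is new when its phase tag matches
// m_cq_valid_phase, which update_cqe_head() toggles every time the head wraps.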
bool NVMeQueue::cqe_available()
{
return PHASE_TAG(m_cqe_array[m_cq_head].status) == m_cq_valid_phase;
}
void NVMeQueue::update_cqe_head()
{
// To prevent overflow, use a temp variable
u32 temp_cq_head = m_cq_head + 1;
if (temp_cq_head == m_qdepth) {
m_cq_head = 0;
m_cq_valid_phase ^= 1;
} else {
m_cq_head = temp_cq_head;
}
}
bool NVMeQueue::handle_irq(const RegisterState&)
{
u32 nr_of_processed_cqes = 0;
while (cqe_available()) {
u16 status;
u16 cmdid;
++nr_of_processed_cqes;
status = CQ_STATUS_FIELD(m_cqe_array[m_cq_head].status);
cmdid = m_cqe_array[m_cq_head].command_id;
dbgln_if(NVME_DEBUG, "NVMe: Completion with status {:x} and command identifier {}. CQ_HEAD: {}", status, cmdid, m_cq_head);
// TODO: We don't use AsyncBlockDeviceRequests for the admin queue, as they only apply to block devices (NVMe namespaces),
// but admin commands precede namespace creation. Unify requests to avoid special-casing.
if (m_admin_queue == false) {
// As the block layer calls are currently synchronous (we wait on each request),
// everything operates on a single request at a time, similar to the BMIDE driver.
// TODO: Remove this constraint eventually.
VERIFY(cmdid == m_prev_sq_tail);
SpinlockLocker lock(m_request_lock);
if (m_current_request) {
complete_current_request(status);
}
}
update_cqe_head();
}
if (nr_of_processed_cqes) {
update_cq_doorbell();
}
return nr_of_processed_cqes ? true : false;
}
void NVMeQueue::submit_sqe(struct NVMeSubmission& sub)
{
SpinlockLocker lock(m_sq_lock);
// For now let's use sq tail as a unique command id.
sub.cmdid = m_sq_tail;
m_prev_sq_tail = m_sq_tail;
memcpy(&m_sqe_array[m_sq_tail], &sub, sizeof(NVMeSubmission));
{
u32 temp_sq_tail = m_sq_tail + 1;
if (temp_sq_tail == m_qdepth)
m_sq_tail = 0;
else
m_sq_tail = temp_sq_tail;
}
dbgln_if(NVME_DEBUG, "NVMe: Submission with command identifier {}. SQ_TAIL: {}", sub.cmdid, m_sq_tail);
full_memory_barrier();
update_sq_doorbell();
}
u16 NVMeQueue::submit_sync_sqe(NVMeSubmission& sub)
{
// For now let's use sq tail as a unique command id.
u16 cqe_cid;
u16 cid = m_sq_tail;
submit_sqe(sub);
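// Busy-wait for completion: the IRQ handler advances m_cq_head past each completed
// entry, so (m_cq_head - 1) is the most recently completed CQE; spin (yielding)
// until its command identifier matches the one we just submitted.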
do {
int index;
{
SpinlockLocker lock(m_cq_lock);
index = m_cq_head - 1;
if (index < 0)
index = IO_QUEUE_SIZE - 1;
}
cqe_cid = m_cqe_array[index].command_id;
Scheduler::yield();
} while (cid != cqe_cid);
auto status = CQ_STATUS_FIELD(m_cqe_array[m_cq_head].status);
return status;
}
void NVMeQueue::read(AsyncBlockDeviceRequest& request, u16 nsid, u64 index, u32 count)
{
NVMeSubmission sub {};
SpinlockLocker m_lock(m_request_lock);
m_current_request = request;
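// Per the NVMe IO command layout, CDW10/CDW11 hold the 64-bit starting LBA
// (low and high dwords) and CDW12 bits 15:0 hold the 0-based number of LBAs.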
sub.op = OP_NVME_READ;
sub.nsid = nsid;
sub.cdw10 = AK::convert_between_host_and_little_endian(index & 0xFFFFFFFF);
sub.cdw11 = AK::convert_between_host_and_little_endian(index >> 32);
// No. of lbas is 0 based
sub.cdw12 = AK::convert_between_host_and_little_endian((count - 1) & 0xFFFF);
sub.data_ptr.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(m_rw_dma_page->paddr().as_ptr()));
full_memory_barrier();
submit_sqe(sub);
}
void NVMeQueue::write(AsyncBlockDeviceRequest& request, u16 nsid, u64 index, u32 count)
{
NVMeSubmission sub {};
SpinlockLocker m_lock(m_request_lock);
m_current_request = request;
if (auto result = m_current_request->read_from_buffer(m_current_request->buffer(), m_rw_dma_region->vaddr().as_ptr(), 512 * m_current_request->block_count()); result.is_error()) {
complete_current_request(AsyncDeviceRequest::MemoryFault);
return;
}
sub.op = OP_NVME_WRITE;
sub.nsid = nsid;
sub.cdw10 = AK::convert_between_host_and_little_endian(index & 0xFFFFFFFF);
sub.cdw11 = AK::convert_between_host_and_little_endian(index >> 32);
// No. of lbas is 0 based
sub.cdw12 = AK::convert_between_host_and_little_endian((count - 1) & 0xFFFF);
sub.data_ptr.prp1 = reinterpret_cast<u64>(AK::convert_between_host_and_little_endian(m_rw_dma_page->paddr().as_ptr()));
full_memory_barrier();
submit_sqe(sub);
}
void NVMeQueue::complete_current_request(u16 status)
{
VERIFY(m_request_lock.is_locked());
g_io_work->queue([this, status]() {
SpinlockLocker lock(m_request_lock);
auto current_request = m_current_request;
m_current_request.clear();
if (status) {
lock.unlock();
current_request->complete(AsyncBlockDeviceRequest::Failure);
return;
}
if (current_request->request_type() == AsyncBlockDeviceRequest::RequestType::Read) {
if (auto result = current_request->write_to_buffer(current_request->buffer(), m_rw_dma_region->vaddr().as_ptr(), 512 * current_request->block_count()); result.is_error()) {
lock.unlock();
current_request->complete(AsyncDeviceRequest::MemoryFault);
return;
}
}
lock.unlock();
current_request->complete(AsyncDeviceRequest::Success);
return;
});
}
}

@@ -0,0 +1,81 @@
/*
* Copyright (c) 2021, Pankaj R <pankydev8@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/NonnullRefPtrVector.h>
#include <AK/OwnPtr.h>
#include <AK/RefCounted.h>
#include <AK/RefPtr.h>
#include <AK/Types.h>
#include <Kernel/Bus/PCI/Device.h>
#include <Kernel/Interrupts/IRQHandler.h>
#include <Kernel/Locking/Spinlock.h>
#include <Kernel/Memory/MemoryManager.h>
#include <Kernel/Memory/TypedMapping.h>
#include <Kernel/Storage/NVMe/NVMeDefinitions.h>
namespace Kernel {
struct DoorbellRegister {
u32 sq_tail;
u32 cq_head;
};
class AsyncBlockDeviceRequest;
class NVMeQueue : public IRQHandler
, public RefCounted<NVMeQueue> {
public:
static ErrorOr<NonnullRefPtr<NVMeQueue>> try_create(u16 qid, u8 irq, u32 q_depth, OwnPtr<Memory::Region> cq_dma_region, NonnullRefPtrVector<Memory::PhysicalPage> cq_dma_page, OwnPtr<Memory::Region> sq_dma_region, NonnullRefPtrVector<Memory::PhysicalPage> sq_dma_page, Memory::TypedMapping<DoorbellRegister> db_regs);
ErrorOr<void> create();
explicit NVMeQueue(u16 qid, u8 irq, u32 q_depth, OwnPtr<Memory::Region> cq_dma_region, NonnullRefPtrVector<Memory::PhysicalPage> cq_dma_page, OwnPtr<Memory::Region> sq_dma_region, NonnullRefPtrVector<Memory::PhysicalPage> sq_dma_page, Memory::TypedMapping<DoorbellRegister> db_regs);
bool is_admin_queue() { return m_admin_queue; };
bool handle_irq(const RegisterState&) override;
void submit_sqe(struct NVMeSubmission&);
u16 submit_sync_sqe(struct NVMeSubmission&);
void read(AsyncBlockDeviceRequest& request, u16 nsid, u64 index, u32 count);
void write(AsyncBlockDeviceRequest& request, u16 nsid, u64 index, u32 count);
void enable_interrupts() { enable_irq(); };
void disable_interrupts() { disable_irq(); };
private:
bool cqe_available();
void update_cqe_head();
void complete_current_request(u16 status);
void update_cq_doorbell()
{
m_db_regs->cq_head = m_cq_head;
}
void update_sq_doorbell()
{
m_db_regs->sq_tail = m_sq_tail;
}
private:
u16 m_qid {};
u8 m_cq_valid_phase { 1 };
u16 m_sq_tail {};
u16 m_prev_sq_tail {};
u16 m_cq_head {};
bool m_admin_queue { false };
u8 m_irq {};
u32 m_qdepth {};
Spinlock m_cq_lock { LockRank::Interrupts };
Spinlock m_sq_lock { LockRank::Interrupts };
OwnPtr<Memory::Region> m_cq_dma_region;
NonnullRefPtrVector<Memory::PhysicalPage> m_cq_dma_page;
Span<NVMeSubmission> m_sqe_array;
OwnPtr<Memory::Region> m_sq_dma_region;
NonnullRefPtrVector<Memory::PhysicalPage> m_sq_dma_page;
Span<NVMeCompletion> m_cqe_array;
OwnPtr<Memory::Region> m_rw_dma_region;
Memory::TypedMapping<DoorbellRegister> m_db_regs;
RefPtr<Memory::PhysicalPage> m_rw_dma_page;
Spinlock m_request_lock;
RefPtr<AsyncBlockDeviceRequest> m_current_request;
};
}

@@ -15,6 +15,7 @@
#include <Kernel/Panic.h>
#include <Kernel/Storage/ATA/AHCIController.h>
#include <Kernel/Storage/ATA/IDEController.h>
#include <Kernel/Storage/NVMe/NVMeController.h>
#include <Kernel/Storage/Partition/EBRPartitionTable.h>
#include <Kernel/Storage/Partition/GUIDPartitionTable.h>
#include <Kernel/Storage/Partition/MBRPartitionTable.h>
@@ -61,6 +62,17 @@ UNMAP_AFTER_INIT void StorageManagement::enumerate_controllers(bool force_pio)
m_controllers.append(AHCIController::initialize(device_identifier));
}
});
PCI::enumerate([&](PCI::DeviceIdentifier const& device_identifier) {
if (device_identifier.class_code().value() == to_underlying(PCI::ClassID::MassStorage)
&& device_identifier.subclass_code().value() == to_underlying(PCI::MassStorage::SubclassID::NVMeController)) {
auto controller = NVMeController::try_initialize(device_identifier);
if (controller.is_error()) {
dmesgln("Unable to initialize NVMe controller");
} else {
m_controllers.append(controller.release_value());
}
}
});
}
m_controllers.append(RamdiskController::initialize());
}

@@ -123,6 +123,7 @@ set(MULTIPROCESSOR_DEBUG ON)
set(NE2000_DEBUG ON)
set(NETWORK_TASK_DEBUG ON)
set(NT_DEBUG ON)
set(NVME_DEBUG ON)
set(OCCLUSIONS_DEBUG ON)
set(OFFD_DEBUG ON)
set(PAGE_FAULT_DEBUG ON)