serenity/Kernel/Net/TCPSocket.cpp

/*
 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Singleton.h>
#include <AK/Time.h>
#include <Kernel/Debug.h>
#include <Kernel/Devices/Generic/RandomDevice.h>
#include <Kernel/FileSystem/OpenFileDescription.h>
#include <Kernel/Locking/MutexProtected.h>
#include <Kernel/Net/EthernetFrameHeader.h>
#include <Kernel/Net/IPv4.h>
#include <Kernel/Net/NetworkAdapter.h>
#include <Kernel/Net/NetworkingManagement.h>
#include <Kernel/Net/Routing.h>
#include <Kernel/Net/TCP.h>
#include <Kernel/Net/TCPSocket.h>
#include <Kernel/Security/Random.h>
#include <Kernel/Tasks/Process.h>
#include <Kernel/Time/TimeManagement.h>

namespace Kernel {

// Invokes `callback` once for every socket currently registered in
// sockets_by_tuple(), iterating the table through its shared accessor.
void TCPSocket::for_each(Function<void(TCPSocket const&)> callback)
{
    sockets_by_tuple().for_each_shared([&callback](auto const& entry) {
        TCPSocket const& socket = *entry.value;
        callback(socket);
    });
}

// Fallible variant of for_each(): visits every registered socket and stops at
// the first callback that returns an error, propagating that error.
ErrorOr<void> TCPSocket::try_for_each(Function<ErrorOr<void>(TCPSocket const&)> callback)
{
    return sockets_by_tuple().with_shared([&callback](auto const& table) -> ErrorOr<void> {
        for (auto const& entry : table) {
            TCPSocket const& socket = *entry.value;
            TRY(callback(socket));
        }
        return {};
    });
}

// Drops one reference to this socket.
//
// The decrement is performed while holding the sockets_by_tuple() table
// exclusively, so that removing the table entry and revoking weak pointers
// happen in the same locked section as the final decrement. Since the table
// stores raw TCPSocket pointers, this keeps a lookup from observing an entry
// for a socket that is about to be deleted.
//
// Returns true if this call led to the socket being destroyed.
bool TCPSocket::unref() const
{
    bool did_hit_zero = sockets_by_tuple().with_exclusive([&](auto& table) {
        // A truthy result from deref_base() is treated as "references remain".
        if (deref_base())
            return false;
        table.remove(tuple());
        const_cast<TCPSocket&>(*this).revoke_weak_ptrs();
        return true;
    });
    // Destruction itself happens after the table lock has been released.
    if (did_hit_zero) {
        const_cast<TCPSocket&>(*this).will_be_destroyed();
        delete this;
    }
    return did_hit_zero;
}

// Moves the socket into `new_state` and performs the bookkeeping tied to that
// transition: role update for outgoing connections, dropping the receive
// buffer in TimeWait, and unregistering/releasing the socket once Closed.
// Blockers are re-evaluated only if the role or the disconnected status changed.
void TCPSocket::set_state(State new_state)
{
    dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket({}) state moving from {} to {}", this, to_string(m_state), to_string(new_state));

    // Snapshot what blockers care about before the state is overwritten.
    auto const role_before = m_role;
    auto const disconnected_before = protocol_is_disconnected();

    m_state = new_state;

    bool const outgoing_connection_established = new_state == State::Established && m_direction == Direction::Outgoing;
    if (outgoing_connection_established) {
        set_role(Role::Connected);
        clear_so_error();
    }

    // Once we hit TimeWait, we are only holding the socket in case there
    // are packets on the way which we wouldn't want a new socket to get hit
    // with, so there's no point in keeping the receive buffer around.
    if (new_state == State::TimeWait)
        drop_receive_buffer();

    if (new_state == State::Closed) {
        closing_sockets().with_exclusive([&](auto& closing) {
            closing.remove(tuple());
        });

        if (m_originator)
            release_to_originator();
    }

    bool const role_changed = role_before != m_role;
    bool const disconnected_status_changed = disconnected_before != protocol_is_disconnected();
    if (role_changed || disconnected_status_changed)
        evaluate_block_conditions();
}

// Table of sockets that have been close()d but have not yet reached the Closed
// state. Entries hold a RefPtr to the socket; close() inserts them and
// set_state(State::Closed) removes them.
static Singleton<MutexProtected<HashMap<IPv4SocketTuple, RefPtr<TCPSocket>>>> s_socket_closing;

// Accessor for the mutex-protected table of closing sockets.
MutexProtected<HashMap<IPv4SocketTuple, RefPtr<TCPSocket>>>& TCPSocket::closing_sockets()
{
    return *s_socket_closing;
}

// Table mapping each bound socket's tuple to the socket itself. The values are
// raw (non-owning) pointers: entries are added by protocol_bind() and
// try_create_client(), and removed by unref() when the last reference goes away.
static Singleton<MutexProtected<HashMap<IPv4SocketTuple, TCPSocket*>>> s_socket_tuples;

// Accessor for the mutex-protected tuple -> socket table.
MutexProtected<HashMap<IPv4SocketTuple, TCPSocket*>>& TCPSocket::sockets_by_tuple()
{
    return *s_socket_tuples;
}

// Finds the socket that should handle traffic for `tuple`.
//
// Lookups are attempted from most to least specific:
//   1. the exact tuple,
//   2. same local address and port with an unspecified peer,
//   3. same local port with unspecified local address and peer.
// Returns a null RefPtr when none of them is registered.
RefPtr<TCPSocket> TCPSocket::from_tuple(IPv4SocketTuple const& tuple)
{
    return sockets_by_tuple().with_shared([&](auto const& table) -> RefPtr<TCPSocket> {
        auto lookup = [&table](IPv4SocketTuple const& key) -> RefPtr<TCPSocket> {
            auto match = table.get(key);
            if (!match.has_value())
                return {};
            return { *match.value() };
        };

        if (auto socket = lookup(tuple); socket)
            return socket;

        IPv4SocketTuple const local_only_tuple(tuple.local_address(), tuple.local_port(), IPv4Address(), 0);
        if (auto socket = lookup(local_only_tuple); socket)
            return socket;

        IPv4SocketTuple const port_only_tuple(IPv4Address(), tuple.local_port(), IPv4Address(), 0);
        return lookup(port_only_tuple);
    });
}
// Creates a socket for an incoming connection on this socket.
//
// The new socket is bound to the given local/peer addresses and ports, marked
// as an incoming connection that is still being set up, and remembers this
// socket as its originator. It is registered both in sockets_by_tuple() and in
// this socket's pending-release map. Fails with EEXIST if the tuple is
// already registered.
ErrorOr<NonnullRefPtr<TCPSocket>> TCPSocket::try_create_client(IPv4Address const& new_local_address, u16 new_local_port, IPv4Address const& new_peer_address, u16 new_peer_port)
{
    IPv4SocketTuple const client_tuple(new_local_address, new_local_port, new_peer_address, new_peer_port);
    return sockets_by_tuple().with_exclusive([&](auto& table) -> ErrorOr<NonnullRefPtr<TCPSocket>> {
        if (table.contains(client_tuple))
            return EEXIST;

        auto client_receive_buffer = TRY(try_create_receive_buffer());
        auto new_client = TRY(TCPSocket::try_create(protocol(), move(client_receive_buffer)));

        // Addressing.
        new_client->set_setup_state(SetupState::InProgress);
        new_client->set_local_address(new_local_address);
        new_client->set_local_port(new_local_port);
        new_client->set_peer_address(new_peer_address);
        new_client->set_peer_port(new_peer_port);

        // Connection bookkeeping.
        new_client->set_bound(true);
        new_client->set_direction(Direction::Incoming);
        new_client->set_originator(*this);

        m_pending_release_for_accept.set(client_tuple, new_client);
        table.set(client_tuple, new_client);

        return { move(new_client) };
    });
}

// Hands this socket over to the socket that created it (see set_originator()
// in try_create_client()) by calling release_for_accept() on it, then forgets
// the originator. Must only be called while an originator is set.
void TCPSocket::release_to_originator()
{
    VERIFY(!!m_originator);
    // NOTE(review): strong_ref() is dereferenced without a null check; this
    // relies on the originator still being alive after the VERIFY above.
    m_originator.strong_ref()->release_for_accept(*this);
    m_originator.clear();
}

// Moves `socket` out of the pending-release map and queues it as a new
// connection on this socket. The socket must currently be pending release.
void TCPSocket::release_for_accept(NonnullRefPtr<TCPSocket> socket)
{
    auto const pending_tuple = socket->tuple();
    VERIFY(m_pending_release_for_accept.contains(pending_tuple));
    m_pending_release_for_accept.remove(pending_tuple);

    // FIXME: Should we observe this error somehow?
    [[maybe_unused]] auto rc = queue_connection_from(move(socket));
}

// Constructs a SOCK_STREAM IPv4 socket that takes ownership of the given
// receive and scratch buffers. Both the "last ACK sent" and "last retransmit"
// timestamps start out at the current monotonic time.
TCPSocket::TCPSocket(int protocol, NonnullOwnPtr<DoubleBuffer> receive_buffer, NonnullOwnPtr<KBuffer> scratch_buffer)
    : IPv4Socket(SOCK_STREAM, protocol, move(receive_buffer), move(scratch_buffer))
    , m_last_ack_sent_time(TimeManagement::the().monotonic_time())
    , m_last_retransmit_time(TimeManagement::the().monotonic_time())
{
}

// Removes the socket from the global retransmit list before it goes away, so
// that the list never refers to a destroyed socket.
TCPSocket::~TCPSocket()
{
    dequeue_for_retransmit();

    dbgln_if(TCP_SOCKET_DEBUG, "~TCPSocket in state {}", to_string(state()));
}

// Allocates a TCPSocket for `protocol` that takes ownership of `receive_buffer`.
// Fails if either the scratch buffer or the socket itself cannot be allocated.
ErrorOr<NonnullRefPtr<TCPSocket>> TCPSocket::try_create(int protocol, NonnullOwnPtr<DoubleBuffer> receive_buffer)
{
    // Note: Scratch buffer is only used for SOCK_STREAM sockets.
    constexpr size_t scratch_buffer_size = 65536;
    auto scratch_buffer = TRY(KBuffer::try_create_with_size("TCPSocket: Scratch buffer"sv, scratch_buffer_size));

    auto* socket = new (nothrow) TCPSocket(protocol, move(receive_buffer), move(scratch_buffer));
    return adopt_nonnull_ref_or_enomem(socket);
}

// Returns the number of TCP payload bytes in `raw_ipv4_packet`, i.e. the total
// size minus the IPv4 header structure and the TCP header (including options).
// NOTE(review): no bounds checking happens here; presumably the caller has
// already validated the header sizes against the packet size.
ErrorOr<size_t> TCPSocket::protocol_size(ReadonlyBytes raw_ipv4_packet)
{
    auto const* ipv4_packet = reinterpret_cast<IPv4Packet const*>(raw_ipv4_packet.data());
    auto const* tcp_packet = static_cast<TCPPacket const*>(ipv4_packet->payload());

    size_t const header_bytes = sizeof(IPv4Packet) + tcp_packet->header_size();
    return raw_ipv4_packet.size() - header_bytes;
}

// Copies the TCP payload of `raw_ipv4_packet` into `buffer` and returns the
// number of bytes copied. `buffer_size` must be large enough for the whole
// payload; this is asserted rather than handled.
ErrorOr<size_t> TCPSocket::protocol_receive(ReadonlyBytes raw_ipv4_packet, UserOrKernelBuffer& buffer, size_t buffer_size, [[maybe_unused]] int flags)
{
    auto const* ipv4_packet = reinterpret_cast<IPv4Packet const*>(raw_ipv4_packet.data());
    auto const* tcp_packet = static_cast<TCPPacket const*>(ipv4_packet->payload());

    size_t const header_bytes = sizeof(IPv4Packet) + tcp_packet->header_size();
    size_t const payload_size = raw_ipv4_packet.size() - header_bytes;
    dbgln_if(TCP_SOCKET_DEBUG, "payload_size {}, will it fit in {}?", payload_size, buffer_size);
    VERIFY(buffer_size >= payload_size);

    SOCKET_TRY(buffer.write(tcp_packet->payload(), payload_size));
    return payload_size;
}

// Sends at most one segment's worth of `data` to the peer.
//
// Returns the number of bytes handed to send_tcp_packet(), which is capped at
// the MSS derived from the routed adapter's MTU. Returns 0 without sending
// when a sub-MSS write arrives while earlier data is still unacknowledged.
// Fails with EHOSTUNREACH if there is no route to the peer.
ErrorOr<size_t> TCPSocket::protocol_send(UserOrKernelBuffer const& data, size_t data_length)
{
    auto adapter = bound_interface().with([](auto& bound_device) -> RefPtr<NetworkAdapter> { return bound_device; });
    RoutingDecision routing_decision = route_to(peer_address(), local_address(), adapter);
    if (routing_decision.is_zero())
        return set_so_error(EHOSTUNREACH);

    size_t const mss = routing_decision.adapter->mtu() - sizeof(IPv4Packet) - sizeof(TCPPacket);

    // RFC 896 (Nagle’s algorithm): https://www.ietf.org/rfc/rfc0896
    // "The solution is to inhibit the sending of new TCP  segments when
    //  new  outgoing  data  arrives  from  the  user  if  any previously
    //  transmitted data on the connection remains unacknowledged.   This
    //  inhibition  is  to be unconditional; no timers, tests for size of
    //  data received, or other conditions are required."
    // FIXME: Make this configurable via TCP_NODELAY.
    bool const has_unacked_data = m_unacked_packets.with_shared([&](auto const& unacked) { return unacked.size > 0; });
    bool const is_small_write = data_length < mss;
    if (has_unacked_data && is_small_write)
        return 0;

    size_t const segment_length = min(data_length, mss);
    TRY(send_tcp_packet(TCPFlags::PSH | TCPFlags::ACK, &data, segment_length, &routing_decision));
    return segment_length;
}

// Sends a bare ACK segment. Unless `allow_duplicate` is set, nothing is sent
// when the current ACK number has already been sent to the peer.
ErrorOr<void> TCPSocket::send_ack(bool allow_duplicate)
{
    bool const would_be_duplicate = m_last_ack_number_sent == m_ack_number;
    if (would_be_duplicate && !allow_duplicate)
        return {};

    return send_tcp_packet(TCPFlags::ACK);
}

// Builds a TCP segment with the given flags (and optional payload), records it
// for retransmission if it needs to be acknowledged, and hands it to the routed
// network adapter.
//
// flags:                 TCPFlags bits for the segment. A bare SYN (flags equal
//                        to exactly TCPFlags::SYN) additionally carries an MSS option.
// payload, payload_size: optional data copied directly behind the TCP header.
// user_routing_decision: when non-null, used instead of doing a fresh route lookup.
//
// Returns an error (also recorded through set_so_error()) when there is no
// route to the peer, no packet buffer could be acquired, the payload could not
// be read, or the segment could not be queued for retransmission.
ErrorOr<void> TCPSocket::send_tcp_packet(u16 flags, UserOrKernelBuffer const* payload, size_t payload_size, RoutingDecision* user_routing_decision)
{
    auto adapter = bound_interface().with([](auto& bound_device) -> RefPtr<NetworkAdapter> { return bound_device; });
    RoutingDecision routing_decision = user_routing_decision ? *user_routing_decision : route_to(peer_address(), local_address(), adapter);
    if (routing_decision.is_zero())
        return set_so_error(EHOSTUNREACH);

    auto ipv4_payload_offset = routing_decision.adapter->ipv4_payload_offset();

    bool const has_mss_option = flags == TCPFlags::SYN;
    const size_t options_size = has_mss_option ? sizeof(TCPOptionMSS) : 0;
    const size_t tcp_header_size = sizeof(TCPPacket) + options_size;
    const size_t buffer_size = ipv4_payload_offset + tcp_header_size + payload_size;
    auto packet = routing_decision.adapter->acquire_packet_buffer(buffer_size);
    if (!packet)
        return set_so_error(ENOMEM);
    routing_decision.adapter->fill_in_ipv4_header(*packet, local_address(),
        routing_decision.next_hop, peer_address(), IPv4Protocol::TCP,
        buffer_size - ipv4_payload_offset, type_of_service(), ttl());

    // Start from an all-zero fixed header; this also leaves the checksum field
    // at zero for the checksum computation further down.
    memset(packet->buffer->data() + ipv4_payload_offset, 0, sizeof(TCPPacket));
    auto& tcp_packet = *(TCPPacket*)(packet->buffer->data() + ipv4_payload_offset);
    VERIFY(local_port());
    tcp_packet.set_source_port(local_port());
    tcp_packet.set_destination_port(peer_port());
    tcp_packet.set_window_size(NumericLimits<u16>::max());
    tcp_packet.set_sequence_number(m_sequence_number);
    tcp_packet.set_data_offset(tcp_header_size / sizeof(u32));
    tcp_packet.set_flags(flags);

    if (payload) {
        if (auto result = payload->read(tcp_packet.payload(), payload_size); result.is_error()) {
            routing_decision.adapter->release_packet_buffer(*packet);
            return set_so_error(result.release_error());
        }
    }

    if (flags & TCPFlags::ACK) {
        m_last_ack_number_sent = m_ack_number;
        m_last_ack_sent_time = TimeManagement::the().monotonic_time();
        tcp_packet.set_ack_number(m_ack_number);
    }

    // A SYN consumes one sequence number; otherwise the payload does.
    if (flags & TCPFlags::SYN) {
        ++m_sequence_number;
    } else {
        m_sequence_number += payload_size;
    }

    if (has_mss_option) {
        u16 mss = routing_decision.adapter->mtu() - sizeof(IPv4Packet) - sizeof(TCPPacket);
        TCPOptionMSS mss_option { mss };
        VERIFY(packet->buffer->size() >= ipv4_payload_offset + sizeof(TCPPacket) + sizeof(mss_option));
        memcpy(packet->buffer->data() + ipv4_payload_offset + sizeof(TCPPacket), &mss_option, sizeof(mss_option));
    }

    tcp_packet.set_checksum(compute_tcp_checksum(local_address(), peer_address(), tcp_packet, payload_size));

    // Segments that the peer has to acknowledge are kept around (together with
    // the sequence number that acknowledges them) until the ACK arrives.
    bool expect_ack { tcp_packet.has_syn() || payload_size > 0 };
    if (expect_ack) {
        bool append_failed { false };
        m_unacked_packets.with_exclusive([&](auto& unacked_packets) {
            auto result = unacked_packets.packets.try_append({ m_sequence_number, packet, ipv4_payload_offset, *routing_decision.adapter });
            if (result.is_error()) {
                dbgln("TCPSocket: Dropped outbound packet because try_append() failed");
                append_failed = true;
                return;
            }
            unacked_packets.size += payload_size;
            enqueue_for_retransmit();
        });
        if (append_failed) {
            // The segment is dropped, so hand the buffer back to the adapter
            // just like the payload-read failure path above does.
            routing_decision.adapter->release_packet_buffer(*packet);
            return set_so_error(ENOMEM);
        }
    }

    m_packets_out++;
    m_bytes_out += buffer_size;
    routing_decision.adapter->send_packet(packet->bytes());
    // Segments awaiting an ACK keep their buffer until receive_tcp_packet()
    // sees the acknowledgement; everything else can be released right away.
    if (!expect_ack)
        routing_decision.adapter->release_packet_buffer(*packet);

    return {};
}

// Accounts for an incoming TCP segment.
//
// packet: the received segment; if it carries an ACK, every queued outgoing
//         segment that the ACK covers is dropped from the unacked queue.
// size:   number of payload bytes in the segment, used for the byte counter.
//
// When the unacked queue becomes empty, the retransmit counter is reset and
// the socket leaves the retransmit list.
void TCPSocket::receive_tcp_packet(TCPPacket const& packet, u16 size)
{
    if (packet.has_ack()) {
        u32 ack_number = packet.ack_number();

        dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket: receive_tcp_packet: {}", ack_number);

        int removed = 0;
        m_unacked_packets.with_exclusive([&](auto& unacked_packets) {
            while (!unacked_packets.packets.is_empty()) {
                auto& unacked_packet = unacked_packets.packets.first();

                dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket: iterate: {}", unacked_packet.ack_number);

                // The queue is in send order, so stop at the first segment the
                // peer has not acknowledged yet.
                // NOTE(review): this is a plain integer comparison and does not
                // account for sequence number wraparound.
                if (unacked_packet.ack_number > ack_number)
                    break;

                TCPPacket& tcp_packet = *(TCPPacket*)(unacked_packet.buffer->buffer->data() + unacked_packet.ipv4_payload_offset);
                if (m_send_window_size != tcp_packet.window_size()) {
                    m_send_window_size = tcp_packet.window_size();
                }
                auto payload_size = unacked_packet.buffer->buffer->data() + unacked_packet.buffer->buffer->size() - (u8*)tcp_packet.payload();
                unacked_packets.size -= payload_size;

                // Only give the buffer back to the adapter once we are done
                // reading from it; after release it may be handed out again.
                if (auto old_adapter = unacked_packet.adapter.strong_ref())
                    old_adapter->release_packet_buffer(*unacked_packet.buffer);

                evaluate_block_conditions();
                unacked_packets.packets.take_first();
                removed++;
            }

            if (unacked_packets.packets.is_empty()) {
                m_retransmit_attempts = 0;
                dequeue_for_retransmit();
            }

            dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket: receive_tcp_packet acknowledged {} packets", removed);
        });
    }

    m_packets_in++;
    m_bytes_in += packet.header_size() + size;
}

bool TCPSocket::should_delay_next_ack() const
{
    // FIXME: We don't know the MSS here so make a reasonable guess.
    const size_t mss = 1500;

    // RFC 1122 says we should send an ACK for every two full-sized segments.
    if (m_ack_number >= m_last_ack_number_sent + 2 * mss)
        return false;

    // RFC 1122 says we should not delay ACKs for more than 500 milliseconds.
    if (TimeManagement::the().monotonic_time(TimePrecision::Precise) >= m_last_ack_sent_time + Duration::from_milliseconds(500))
        return false;

    return true;
}

// Computes the TCP checksum for `packet` followed by `payload_size` bytes of
// payload, sent from `source` to `destination`.
//
// The sum covers a 12-byte pseudo header (addresses, zero byte, protocol and
// TCP length), the TCP header including options, and the payload. 16-bit words
// are added with the carry folded back in after each addition; a trailing odd
// payload byte is treated as the high byte of a final word. The result is the
// bitwise complement of the folded sum.
NetworkOrdered<u16> TCPSocket::compute_tcp_checksum(IPv4Address const& source, IPv4Address const& destination, TCPPacket const& packet, u16 payload_size)
{
    union PseudoHeader {
        struct [[gnu::packed]] {
            IPv4Address source;
            IPv4Address destination;
            u8 zero;
            u8 protocol;
            NetworkOrdered<u16> payload_size;
        } header;
        u16 raw[6];
    };
    static_assert(sizeof(PseudoHeader) == 12);

    Checked<u16> segment_length = packet.header_size();
    segment_length += payload_size;
    VERIFY(!segment_length.has_overflow());

    PseudoHeader pseudo_header { .header = { source, destination, 0, (u8)IPv4Protocol::TCP, segment_length.value() } };

    u32 checksum = 0;

    // Adds one host-order word and folds any carry back into the low 16 bits.
    auto add_word = [&checksum](u16 host_order_word) {
        checksum += host_order_word;
        if (checksum > 0xffff)
            checksum = (checksum >> 16) + (checksum & 0xffff);
    };

    // Adds `word_count` consecutive network-order words starting at `words`.
    auto add_network_words = [&add_word](u16 const* words, size_t word_count) {
        for (size_t index = 0; index < word_count; ++index)
            add_word(AK::convert_between_host_and_network_endian(words[index]));
    };

    add_network_words(pseudo_header.raw, sizeof(pseudo_header) / sizeof(u16));
    add_network_words(bit_cast<u16*>(&packet), packet.header_size() / sizeof(u16));

    VERIFY(packet.data_offset() * 4 == packet.header_size());
    add_network_words(bit_cast<u16*>(packet.payload()), payload_size / sizeof(u16));

    if (payload_size & 1) {
        u16 expanded_byte = ((u8 const*)packet.payload())[payload_size - 1] << 8;
        add_word(expanded_byte);
    }

    return ~(checksum & 0xffff);
}

// Registers this socket in sockets_by_tuple().
//
// If a specific local address is set and no adapter is known yet, the adapter
// owning that address is looked up first (EADDRNOTAVAIL if there is none).
// With an explicit local port, the socket's tuple must not be registered yet.
// With local port 0, an unused ephemeral port is chosen by scanning the
// ephemeral range starting from a random position. Both paths fail with
// EADDRINUSE when no free tuple is available.
ErrorOr<void> TCPSocket::protocol_bind()
{
    dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket::protocol_bind(), local_port() is {}", local_port());

    // Check that we do have the address we're trying to bind to.
    TRY(m_adapter.with([this](auto& adapter) -> ErrorOr<void> {
        if (!has_specific_local_address() || !!adapter)
            return {};
        adapter = NetworkingManagement::the().from_ipv4_address(local_address());
        if (!adapter)
            return set_so_error(EADDRNOTAVAIL);
        return {};
    }));

    if (local_port() != 0) {
        // Verify that the user-supplied port is not already used by someone else.
        bool const registered = sockets_by_tuple().with_exclusive([&](auto& table) -> bool {
            if (table.contains(tuple()))
                return false;
            table.set(tuple(), this);
            return true;
        });
        if (registered)
            return {};
        return set_so_error(EADDRINUSE);
    }

    // Allocate an unused ephemeral port.
    constexpr u16 first_ephemeral_port = 32768;
    constexpr u16 last_ephemeral_port = 60999;
    constexpr u16 ephemeral_port_range_size = last_ephemeral_port - first_ephemeral_port;
    u16 first_scan_port = first_ephemeral_port + get_good_random<u16>() % ephemeral_port_range_size;

    return sockets_by_tuple().with_exclusive([&](auto& table) -> ErrorOr<void> {
        u16 candidate_port = first_scan_port;
        do {
            IPv4SocketTuple proposed_tuple(local_address(), candidate_port, peer_address(), peer_port());

            if (!table.contains(proposed_tuple)) {
                set_local_port(candidate_port);
                table.set(proposed_tuple, this);
                dbgln_if(TCP_SOCKET_DEBUG, "...allocated port {}, tuple {}", candidate_port, proposed_tuple.to_string());
                return {};
            }

            // Advance, wrapping around at the end of the ephemeral range.
            ++candidate_port;
            if (candidate_port > last_ephemeral_port)
                candidate_port = first_ephemeral_port;
        } while (candidate_port != first_scan_port);

        // We came full circle without finding a free port.
        return set_so_error(EADDRINUSE);
    });
}

// Puts the socket into listening mode: marks it as a passive socket, moves it
// to the Listen state and marks its setup as completed. Always succeeds.
ErrorOr<void> TCPSocket::protocol_listen()
{
    set_direction(Direction::Passive);
    set_state(State::Listen);
    set_setup_state(SetupState::Completed);
    return {};
}

// Starts an outgoing connection to the configured peer by sending a SYN.
//
// For blocking descriptions this waits until the connection setup has
// completed and returns success, ETIMEDOUT (retransmit timeout), ECONNREFUSED
// (any other socket error) or EINTR (the wait was interrupted). For
// non-blocking descriptions it returns EINPROGRESS right after sending the SYN.
// Fails with EHOSTUNREACH when there is no route to the peer.
ErrorOr<void> TCPSocket::protocol_connect(OpenFileDescription& description)
{
    MutexLocker locker(mutex());

    auto routing_decision = route_to(peer_address(), local_address());
    if (routing_decision.is_zero())
        return set_so_error(EHOSTUNREACH);
    // Without an explicit local address, use the address of the adapter the
    // route goes through.
    if (!has_specific_local_address())
        set_local_address(routing_decision.adapter->ipv4_address());

    TRY(ensure_bound());

    // Pick a random initial sequence number for this connection.
    m_sequence_number = get_good_random<u32>();
    m_ack_number = 0;

    set_setup_state(SetupState::InProgress);
    TRY(send_tcp_packet(TCPFlags::SYN));
    m_state = State::SynSent;
    set_role(Role::Connecting);
    m_direction = Direction::Outgoing;

    evaluate_block_conditions();

    if (description.is_blocking()) {
        // The socket mutex is released while this thread is blocked and taken
        // again once the blocker returns.
        locker.unlock();
        auto unblock_flags = Thread::FileBlocker::BlockFlags::None;
        if (Thread::current()->block<Thread::ConnectBlocker>({}, description, unblock_flags).was_interrupted())
            return set_so_error(EINTR);
        locker.lock();
        VERIFY(setup_state() == SetupState::Completed);
        if (has_error()) { // TODO: check unblock_flags
            set_role(Role::None);
            if (error() == TCPSocket::Error::RetransmitTimeout)
                return set_so_error(ETIMEDOUT);
            else
                return set_so_error(ECONNREFUSED);
        }
        return {};
    }

    return set_so_error(EINPROGRESS);
}

// Returns true when the connection is closed or somewhere in its teardown
// sequence (Closed, CloseWait, LastAck, FinWait1, FinWait2, Closing, TimeWait).
// Every other state counts as not disconnected.
bool TCPSocket::protocol_is_disconnected() const
{
    auto const current_state = m_state;
    return current_state == State::Closed
        || current_state == State::CloseWait
        || current_state == State::LastAck
        || current_state == State::FinWait1
        || current_state == State::FinWait2
        || current_state == State::Closing
        || current_state == State::TimeWait;
}

// Closes the sending direction of the connection. In the Established state a
// FIN is sent (best effort, the result is ignored) and the socket moves to
// FinWait1; in any other state this only logs a message.
void TCPSocket::shut_down_for_writing()
{
    if (state() != State::Established) {
        dbgln(" Shutting down TCPSocket for writing but not moving to FinWait1 since state is {}", to_string(state()));
        return;
    }

    dbgln_if(TCP_SOCKET_DEBUG, " Sending FIN from Established and moving into FinWait1");
    (void)send_tcp_packet(TCPFlags::FIN | TCPFlags::ACK);
    set_state(State::FinWait1);
}

// Closes the socket and returns the result of IPv4Socket::close().
//
// If the peer has already closed its side (CloseWait), our FIN is sent (best
// effort) and the socket moves to LastAck. Unless the socket ends up Closed or
// was merely listening, it is then registered in closing_sockets(), which
// keeps a reference to it until set_state(State::Closed) removes the entry.
ErrorOr<void> TCPSocket::close()
{
    MutexLocker locker(mutex());
    auto result = IPv4Socket::close();

    if (state() == State::CloseWait) {
        dbgln_if(TCP_SOCKET_DEBUG, " Sending FIN from CloseWait and moving into LastAck");
        (void)send_tcp_packet(TCPFlags::FIN | TCPFlags::ACK);
        set_state(State::LastAck);
    }

    bool const teardown_still_pending = !(state() == State::Closed || state() == State::Listen);
    if (teardown_still_pending) {
        closing_sockets().with_exclusive([&](auto& closing) {
            closing.set(tuple(), *this);
        });
    }

    return result;
}

// List of sockets that have segments waiting for an acknowledgement. Sockets
// are added by enqueue_for_retransmit() and removed by dequeue_for_retransmit().
static Singleton<MutexProtected<TCPSocket::RetransmitList>> s_sockets_for_retransmit;

// Accessor for the mutex-protected retransmit list.
MutexProtected<TCPSocket::RetransmitList>& TCPSocket::sockets_for_retransmit()
{
    return *s_sockets_for_retransmit;
}

void TCPSocket::enqueue_for_retransmit()
{
    sockets_for_retransmit().with_exclusive([&](auto& list) {
        list.append(*this);
    });
}

void TCPSocket::dequeue_for_retransmit()
{
    sockets_for_retransmit().with_exclusive([&](auto& list) {
        list.remove(*this);
    });
}

void TCPSocket::retransmit_packets()
{
    auto now = TimeManagement::the().monotonic_time();

    // RFC6298 says we should have at least one second between retransmits. According to
    // RFC1122 we must do exponential backoff - even for SYN packets.
    i64 retransmit_interval = 1;
    for (decltype(m_retransmit_attempts) i = 0; i < m_retransmit_attempts; i++)
        retransmit_interval *= 2;

    if (m_last_retransmit_time > now - Duration::from_seconds(retransmit_interval))
        return;

    dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket({}) handling retransmit", this);

    m_last_retransmit_time = now;
    ++m_retransmit_attempts;

    if (m_retransmit_attempts > maximum_retransmits) {
        set_state(TCPSocket::State::Closed);
        set_error(TCPSocket::Error::RetransmitTimeout);
        set_setup_state(Socket::SetupState::Completed);
        return;
    }

    auto adapter = bound_interface().with([](auto& bound_device) -> RefPtr<NetworkAdapter> { return bound_device; });
    auto routing_decision = route_to(peer_address(), local_address(), adapter);
    if (routing_decision.is_zero())
        return;

    m_unacked_packets.with_exclusive([&](auto& unacked_packets) {
        for (auto& packet : unacked_packets.packets) {
            packet.tx_counter++;

            if constexpr (TCP_SOCKET_DEBUG) {
                auto& tcp_packet = *(const TCPPacket*)(packet.buffer->buffer->data() + packet.ipv4_payload_offset);
                dbgln("Sending TCP packet from {}:{} to {}:{} with ({}{}{}{}) seq_no={}, ack_no={}, tx_counter={}",
                    local_address(), local_port(),
                    peer_address(), peer_port(),
                    (tcp_packet.has_syn() ? "SYN " : ""),
                    (tcp_packet.has_ack() ? "ACK " : ""),
                    (tcp_packet.has_fin() ? "FIN " : ""),
                    (tcp_packet.has_rst() ? "RST " : ""),
                    tcp_packet.sequence_number(),
                    tcp_packet.ack_number(),
                    packet.tx_counter);
            }

            size_t ipv4_payload_offset = routing_decision.adapter->ipv4_payload_offset();
            if (ipv4_payload_offset != packet.ipv4_payload_offset) {
                // FIXME: Add support for this. This can happen if after a route change
                // we ended up on another adapter which doesn't have the same layer 2 type
                // like the previous adapter.
                VERIFY_NOT_REACHED();
            }

            auto packet_buffer = packet.buffer->bytes();

            routing_decision.adapter->fill_in_ipv4_header(*packet.buffer,
                local_address(), routing_decision.next_hop, peer_address(),
                IPv4Protocol::TCP, packet_buffer.size() - ipv4_payload_offset, type_of_service(), ttl());
            routing_decision.adapter->send_packet(packet_buffer);
            m_packets_out++;
            m_bytes_out += packet_buffer.size();
        }
    });
}

// Reports whether `size` bytes can currently be written to the socket.
//
// Writing is refused while IPv4Socket::can_write() refuses it or while the
// handshake is still in progress (SynSent/SynReceived). Non-blocking
// descriptions are otherwise always writable; blocking ones only while the
// unacknowledged bytes plus `size` fit into the send window.
bool TCPSocket::can_write(OpenFileDescription const& file_description, u64 size) const
{
    if (!IPv4Socket::can_write(file_description, size))
        return false;

    bool const handshake_in_progress = m_state == State::SynSent || m_state == State::SynReceived;
    if (handshake_in_progress)
        return false;

    if (!file_description.is_blocking())
        return true;

    return m_unacked_packets.with_shared([&](auto& unacked) {
        auto const bytes_in_flight_after_write = unacked.size + size;
        return bytes_in_flight_after_write <= m_send_window_size;
    });
}
}
-												Meta: Add license header to source files

As suggested by Joshua, this commit adds the 2-clause BSD license as a
comment block to the top of every source file.

For the first pass, I've just added myself for simplicity. I encourage
everyone to add themselves as copyright holders of any file they've
added or modified in some significant way. If I've added myself in
error somewhere, feel free to replace it with the appropriate copyright
holder instead.

Going forward, all new source files should include a license header.

											
										
										
											2020-01-18 08:38:21 +00:00
+								/*
 								 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
 								 *
-												Everything: Move to SPDX license identifiers in all files.

SPDX License Identifiers are a more compact / standardized
way of representing file license information.

See: https://spdx.dev/resources/use/#identifiers

This was done with the `ambr` search and replace tool.

 ambr --no-parent-ignore --key-from-file --rep-from-file key.txt rep.txt *

											
										
										
											2021-04-22 08:24:48 +00:00
+								 * SPDX-License-Identifier: BSD-2-Clause
-												Meta: Add license header to source files

As suggested by Joshua, this commit adds the 2-clause BSD license as a
comment block to the top of every source file.

For the first pass, I've just added myself for simplicity. I encourage
everyone to add themselves as copyright holders of any file they've
added or modified in some significant way. If I've added myself in
error somewhere, feel free to replace it with the appropriate copyright
holder instead.

Going forward, all new source files should include a license header.

											
										
										
											2020-01-18 08:38:21 +00:00
+								 */
-												Kernel: Switch singletons to use new Singleton class

MemoryManager cannot use the Singleton class because
MemoryManager::initialize is called before the global constructors
are run. That caused the Singleton to be re-initialized, causing
it to create another MemoryManager instance.

Fixes #3226

											
										
										
											2020-08-25 01:35:19 +00:00
+								#include <AK/Singleton.h>
-												Kernel: Use timeval_sub for TCP retransmissions and lower timer to 500ms

											
										
										
											2019-09-08 09:40:26 +00:00
+								#include <AK/Time.h>
-												Meta: Split debug defines into multiple headers.

The following script was used to make these changes:

    #!/bin/bash
    set -e

    tmp=$(mktemp -d)

    echo "tmp=$tmp"

    find Kernel \( -name '*.cpp' -o -name '*.h' \) | sort > $tmp/Kernel.files
    find . \( -path ./Toolchain -prune -o -path ./Build -prune -o -path ./Kernel -prune \) -o \( -name '*.cpp' -o -name '*.h' \) -print | sort > $tmp/EverythingExceptKernel.files

    cat $tmp/Kernel.files | xargs grep -Eho '[A-Z0-9_]+_DEBUG' | sort | uniq > $tmp/Kernel.macros
    cat $tmp/EverythingExceptKernel.files | xargs grep -Eho '[A-Z0-9_]+_DEBUG' | sort | uniq > $tmp/EverythingExceptKernel.macros

    comm -23 $tmp/Kernel.macros $tmp/EverythingExceptKernel.macros > $tmp/Kernel.unique
    comm -1 $tmp/Kernel.macros $tmp/EverythingExceptKernel.macros > $tmp/EverythingExceptKernel.unique

    cat $tmp/Kernel.unique | awk '{ print "#cmakedefine01 "$1 }' > $tmp/Kernel.header
    cat $tmp/EverythingExceptKernel.unique | awk '{ print "#cmakedefine01 "$1 }' > $tmp/EverythingExceptKernel.header

    for macro in $(cat $tmp/Kernel.unique)
    do
        cat $tmp/Kernel.files | xargs grep -l $macro >> $tmp/Kernel.new-includes ||:
    done
    cat $tmp/Kernel.new-includes | sort > $tmp/Kernel.new-includes.sorted

    for macro in $(cat $tmp/EverythingExceptKernel.unique)
    do
        cat $tmp/Kernel.files | xargs grep -l $macro >> $tmp/Kernel.old-includes ||:
    done
    cat $tmp/Kernel.old-includes | sort > $tmp/Kernel.old-includes.sorted

    comm -23 $tmp/Kernel.new-includes.sorted $tmp/Kernel.old-includes.sorted > $tmp/Kernel.includes.new
    comm -13 $tmp/Kernel.new-includes.sorted $tmp/Kernel.old-includes.sorted > $tmp/Kernel.includes.old
    comm -12 $tmp/Kernel.new-includes.sorted $tmp/Kernel.old-includes.sorted > $tmp/Kernel.includes.mixed

    for file in $(cat $tmp/Kernel.includes.new)
    do
        sed -i -E 's/#include <AK\/Debug\.h>/#include <Kernel\/Debug\.h>/' $file
    done

    for file in $(cat $tmp/Kernel.includes.mixed)
    do
        echo "mixed include in $file, requires manual editing."
    done

											
										
										
											2021-01-25 15:07:10 +00:00
+								#include <Kernel/Debug.h>
-												Kernel: Move a bunch of generic devices code into new subdirectory

											
										
										
											2023-03-18 11:17:13 +00:00
+								#include <Kernel/Devices/Generic/RandomDevice.h>
-												Kernel: Rename FileDescription => OpenFileDescription

Dr. POSIX really calls these "open file description", not just
"file description", so let's call them exactly that. :^)

											
										
										
											2021-09-07 11:39:11 +00:00
+								#include <Kernel/FileSystem/OpenFileDescription.h>
-												Kernel: Rename ProtectedValue<T> => MutexProtected<T>

Let's make it obvious what we're protecting it with.

											
										
										
											2021-08-21 21:31:15 +00:00
+								#include <Kernel/Locking/MutexProtected.h>
-												Kernel: Don't try to send TCP packets larger than the MSS

Previously TCPSocket::send_tcp_packet() would try to send TCP packets
which matched whatever size the userspace program specified. We'd try to
break those packets up into smaller fragments, however a much better
approach is to limit TCP packets to the maximum segment size and
avoid fragmentation altogether.

											
										
										
											2021-05-25 19:29:37 +00:00
+								#include <Kernel/Net/EthernetFrameHeader.h>
-												Kernel: Set MSS option for outbound TCP SYN packets

When the MSS option header is missing the default maximum segment
size is 536 which results in lots of very small TCP packets that
NetworkTask has to handle.

This adds the MSS option header to outbound TCP SYN packets and
sets it to an appropriate value depending on the interface's MTU.

Note that we do not currently do path MTU discovery so this could
cause problems when hops don't fragment packets properly.

											
										
										
											2021-05-11 19:09:11 +00:00
+								#include <Kernel/Net/IPv4.h>
-												Kernel: Move networking related files into Kernel/Net/.

											
										
										
											2019-04-02 17:54:38 +00:00
+								#include <Kernel/Net/NetworkAdapter.h>
-												Kernel: Introduce the NetworkingManagement singleton

Instead of initializing network adapters in init.cpp, let's move that
logic into a separate class to handle this.
Also, it seems like a good idea to shift responsibility for enumeration
of network adapters after the boot process, so this singleton will take
care of finding the appropriate network adapter when asked to with an
IPv4 address or interface name.

With this change being merged, we simplify the creation logic of
NetworkAdapter derived classes, so we enumerate the PCI bus only once,
searching for driver candidates when doing so, and we let each driver
test whether it is responsible for the specified PCI device.

											
										
										
											2021-06-04 04:43:16 +00:00
+								#include <Kernel/Net/NetworkingManagement.h>
-												Kernel: Add a LoopbackAdapter for talking to yourself via 127.0.0.1.

Choosing adapter for transmit is done by adapter_for_route_to(IPv4Address).
This is just hard-coded logic right now but can be expanded to support a
proper routing table.

Also start moving kernel networking code into Kernel/Net/.

											
										
										
											2019-04-02 13:46:44 +00:00
+								#include <Kernel/Net/Routing.h>
-												Kernel: Run clang-format on everything.

											
										
										
											2019-06-07 09:43:58 +00:00
+								#include <Kernel/Net/TCP.h>
 								#include <Kernel/Net/TCPSocket.h>
-												Kernel: Move Random.{h,cpp} code to Security subdirectory

											
										
										
											2023-02-24 17:49:37 +00:00
+								#include <Kernel/Security/Random.h>
-												Kernel: Move all tasks-related code to the Tasks subdirectory

											
										
										
											2023-02-24 17:45:37 +00:00
+								#include <Kernel/Tasks/Process.h>
-												Kernel/Net: Use monotonic time for TCP times

These were using real time by mistake before; changing the system time
during ongoing TCP connections shouldn’t break them.

											
										
										
											2023-08-17 17:20:42 +00:00
+								#include <Kernel/Time/TimeManagement.h>
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
-												Kernel: Move all code into the Kernel namespace

											
										
										
											2020-02-16 00:27:42 +00:00
+								namespace Kernel {
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								void TCPSocket::for_each(Function<void(TCPSocket const&)> callback)
-												Kernel: Refactor TCP/IP stack

This has several significant changes to the networking stack.

* Significant refactoring of the TCP state machine. Right now it's
  probably more fragile than it used to be, but handles quite a lot
  more of the handshake process.
* `TCPSocket` holds a `NetworkAdapter*`, assigned during `connect()` or
  `bind()`, whichever comes first.
* `listen()` is now virtual in `Socket` and intended to be implemented
  in its child classes
* `listen()` no longer works without `bind()` - this is a bit of a
  regression, but listening sockets didn't work at all before, so it's
  not possible to observe the regression.
* A file is exposed at `/proc/net_tcp`, which is a JSON document listing
  the current TCP sockets with a bit of metadata.
* There's an `ETHERNET_VERY_DEBUG` flag for dumping packet's content out
  to `kprintf`. It is, indeed, _very debug_.

											
										
										
											2019-08-06 13:40:38 +00:00
+								{
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								    sockets_by_tuple().for_each_shared([&](auto const& it) {
-												Kernel: Make TCPSocket::for_each() callback accept a reference

Yay for less arrows!

											
										
										
											2019-08-09 10:26:29 +00:00
+								        callback(*it.value);
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared lock to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								    });
-												Kernel: Refactor TCP/IP stack

This has several significant changes to the networking stack.

* Significant refactoring of the TCP state machine. Right now it's
  probably more fragile than it used to be, but handles quite a lot
  more of the handshake process.
* `TCPSocket` holds a `NetworkAdapter*`, assigned during `connect()` or
  `bind()`, whichever comes first.
* `listen()` is now virtual in `Socket` and intended to be implemented
  in its child classes
* `listen()` no longer works without `bind()` - this is a bit of a
  regression, but listening sockets didn't work at all before, so it's
  not possible to observe the regression.
* A file is exposed at `/proc/net_tcp`, which is a JSON document listing
  the current TCP sockets with a bit of metadata.
* There's an `ETHERNET_VERY_DEBUG` flag for dumping packet's content out
  to `kprintf`. It is, indeed, _very debug_.

											
										
										
											2019-08-06 13:40:38 +00:00
+								}
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								ErrorOr<void> TCPSocket::try_for_each(Function<ErrorOr<void>(TCPSocket const&)> callback)
-												Kernel: Add TCPSocket::try_for_each() for fallible iteration

This API will allow users to short circuit iteration and properly
propagate errors.

											
										
										
											2022-02-24 18:05:24 +00:00
+								{
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								    return sockets_by_tuple().with_shared([&](auto const& sockets) -> ErrorOr<void> {
-												Kernel: Add TCPSocket::try_for_each() for fallible iteration

This API will allow users to short circuit iteration and properly
propagate errors.

											
										
										
											2022-02-24 18:05:24 +00:00
+								        for (auto& it : sockets)
 								            TRY(callback(*it.value));
 								        return {};
 								    });
 								}
-												Kernel: Lock TCPSocket lookup table across destruction

Use the same trick as SlavePTY and override unref() to provide safe
removal from the sockets_by_tuple table when destroying a TCPSocket.

This should fix the TCPSocket::from_tuple() flake seen on CI.

											
										
										
											2022-01-06 21:06:00 +00:00
+								bool TCPSocket::unref() const
 								{
 								    bool did_hit_zero = sockets_by_tuple().with_exclusive([&](auto& table) {
 								        if (deref_base())
 								            return false;
 								        table.remove(tuple());
-												Kernel: Lock weak pointer revocation during listed-ref-counted unref

When doing the last unref() on a listed-ref-counted object, we keep
the list locked while mutating the ref count. The destructor itself
is invoked after unlocking the list.

This was racy with weakable classes, since their weak pointer factory
still pointed to the object after we'd decided to destroy it. That
opened a small time window where someone could try to strong-ref a weak
pointer to an object after it was removed from the list, but just before
the destructor got invoked.

This patch closes the race window by explicitly revoking all weak
pointers while the list is locked.

											
										
										
											2022-01-08 14:43:56 +00:00
+								        const_cast<TCPSocket&>(*this).revoke_weak_ptrs();
-												Kernel: Lock TCPSocket lookup table across destruction

Use the same trick as SlavePTY and override unref() to provide safe
removal from the sockets_by_tuple table when destroying a TCPSocket.

This should fix the TCPSocket::from_tuple() flake seen on CI.

											
										
										
											2022-01-06 21:06:00 +00:00
+								        return true;
 								    });
 								    if (did_hit_zero) {
 								        const_cast<TCPSocket&>(*this).will_be_destroyed();
 								        delete this;
 								    }
 								    return did_hit_zero;
 								}
-												Kernel: Move TCP state logging into TCPSocket

											
										
										
											2019-08-10 03:14:00 +00:00
+								void TCPSocket::set_state(State new_state)
 								{
-												Everywhere: Replace dbgln<flag>(...) with dbgln_if(flag, ...)

Replacement made by `find Kernel Userland -name '*.h' -o -name '*.cpp' | sed -i -Ee 's/dbgln\b<(\w+)>\(/dbgln_if(\1, /g'`

											
										
										
											2021-02-07 12:03:24 +00:00
+								    dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket({}) state moving from {} to {}", this, to_string(m_state), to_string(new_state));
-												Kernel: Move TCP state logging into TCPSocket

											
										
										
											2019-08-10 03:14:00 +00:00
-												Kernel: Move block condition evaluation out of the Scheduler

This makes the Scheduler a lot leaner by not having to evaluate
block conditions every time it is invoked. Instead evaluate them as
the states change, and unblock threads at that point.

This also implements some more waitid/waitpid/wait features and
behavior. For example, WUNTRACED and WNOWAIT are now supported. And
wait will now not return EINTR when SIGCHLD is delivered at the
same time.

											
										
										
											2020-11-29 23:05:27 +00:00
+								    auto was_disconnected = protocol_is_disconnected();
 								    auto previous_role = m_role;
-												Kernel: Move TCP state logging into TCPSocket

											
										
										
											2019-08-10 03:14:00 +00:00
+								    m_state = new_state;
-												Kernel: Move socket role tracking to the Socket class itself

This is more logical and allows us to solve the problem of
non-blocking TCP sockets getting stuck in SocketRole::None.

The only complication is that a single LocalSocket may be shared
between two file descriptions (on the connect and accept sides),
and should have two different roles depending from which side
you look at it. To deal with it, Socket::role() is made a
virtual method that accepts a file description, and LocalSocket
internally tracks which FileDescription is the which one and
returns a correct role.

											
										
										
											2019-08-11 13:38:20 +00:00
-												Kernel: Clear SO_ERROR on successful socket connection

When TCP sockets successfully establish a connection, any SO_ERROR
should be cleared back to success. For example, SO_ERROR gets set to
EINPROGRESS on asynchronous connect calls and should be cleared when
the socket moves to the Established state.

											
										
										
											2021-08-12 02:49:18 +00:00
+								    if (new_state == State::Established && m_direction == Direction::Outgoing) {
-												Kernel: Add Socket::set_role() and use it everywhere

Instead of having Socket subclasses write their role into Socket::m_role
directly, add a setter to do this.

											
										
										
											2021-08-29 00:04:30 +00:00
+								        set_role(Role::Connected);
-												Kernel: Replace KResult and KResultOr<T> with Error and ErrorOr<T>

We now use AK::Error and AK::ErrorOr<T> in both kernel and userspace!
This was a slightly tedious refactoring that took a long time, so it's
not unlikely that some bugs crept in.

Nevertheless, it does pass basic functionality testing, and it's just
real nice to finally see the same pattern in all contexts. :^)

											
										
										
											2021-11-07 23:51:39 +00:00
+								        clear_so_error();
-												Kernel: Clear SO_ERROR on successful socket connection

When TCP sockets successfully establish a connection, any SO_ERROR
should be cleared back to success. For example, SO_ERROR gets set to
EINPROGRESS on asynchronous connect calls and should be cleared when
the socket moves to the Established state.

											
										
										
											2021-08-12 02:49:18 +00:00
+								    }
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
-												Kernel: Drop the receive buffer when socket enters the TimeWait state

The TimeWait state is intended to prevent another socket from taking the
address tuple in case any packets are still in transit after the final
close. Since this state never delivers packets to userspace, it doesn't
make sense to keep the receive buffer around.

											
										
										
											2021-09-16 00:15:36 +00:00
+								    if (new_state == State::TimeWait) {
 								        // Once we hit TimeWait, we are only holding the socket in case there
 								        // are packets on the way which we wouldn't want a new socket to get hit
 								        // with, so there's no point in keeping the receive buffer around.
 								        drop_receive_buffer();
 								    }
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
+								    if (new_state == State::Closed) {
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared lock to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								        closing_sockets().with_exclusive([&](auto& table) {
 								            table.remove(tuple());
 								        });
-												Kernel: Remove socket from the listener's accept list when it is closed

Without this patch we end up with sockets in the listener's accept
queue with state 'closed' when doing stealth SYN scans:

Client -> Server: SYN for port 22
Server -> Client: SYN/ACK
Client -> Server: RST (i.e. don't complete the TCP handshake)

											
										
										
											2021-04-30 19:43:37 +00:00
 								        if (m_originator)
 								            release_to_originator();
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
+								    }
-												Kernel: Move block condition evaluation out of the Scheduler

This makes the Scheduler a lot leaner by not having to evaluate
block conditions every time it is invoked. Instead evaluate them as
the states change, and unblock threads at that point.

This also implements some more waitid/waitpid/wait features and
behavior. For example, WUNTRACED and WNOWAIT are now supported. And
wait will now not return EINTR when SIGCHLD is delivered at the
same time.

											
										
										
											2020-11-29 23:05:27 +00:00
 								    if (previous_role != m_role || was_disconnected != protocol_is_disconnected())
 								        evaluate_block_conditions();
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
+								}
-												Kernel: Use RefPtr instead of LockRefPtr for File and subclasses

This was mostly straightforward, as all the storage locations are
guarded by some related mutex.

The use of old-school associated mutexes instead of MutexProtected
is unfortunate, but the process to modernize such code is ongoing.

											
										
										
											2023-03-10 06:53:02 +00:00
+								static Singleton<MutexProtected<HashMap<IPv4SocketTuple, RefPtr<TCPSocket>>>> s_socket_closing;
-												Kernel: Switch singletons to use new Singleton class

MemoryManager cannot use the Singleton class because
MemoryManager::initialize is called before the global constructors
are run. That caused the Singleton to be re-initialized, causing
it to create another MemoryManager instance.

Fixes #3226

											
										
										
											2020-08-25 01:35:19 +00:00
-												Kernel: Use RefPtr instead of LockRefPtr for File and subclasses

This was mostly straightforward, as all the storage locations are
guarded by some related mutex.

The use of old-school associated mutexes instead of MutexProtected
is unfortunate, but the process to modernize such code is ongoing.

											
										
										
											2023-03-10 06:53:02 +00:00
+								MutexProtected<HashMap<IPv4SocketTuple, RefPtr<TCPSocket>>>& TCPSocket::closing_sockets()
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
+								{
-												Kernel: Switch singletons to use new Singleton class

MemoryManager cannot use the Singleton class because
MemoryManager::initialize is called before the global constructors
are run. That caused the Singleton to be re-initialized, causing
it to create another MemoryManager instance.

Fixes #3226

											
										
										
											2020-08-25 01:35:19 +00:00
+								    return *s_socket_closing;
-												Kernel: Move TCP state logging into TCPSocket

											
										
										
											2019-08-10 03:14:00 +00:00
+								}
-												Kernel: Rename ProtectedValue<T> => MutexProtected<T>

Let's make it obvious what we're protecting it with.

											
										
										
											2021-08-21 21:31:15 +00:00
+								static Singleton<MutexProtected<HashMap<IPv4SocketTuple, TCPSocket*>>> s_socket_tuples;
-												Kernel: Switch singletons to use new Singleton class

MemoryManager cannot use the Singleton class because
MemoryManager::initialize is called before the global constructors
are run. That caused the Singleton to be re-initialized, causing
it to create another MemoryManager instance.

Fixes #3226

											
										
										
											2020-08-25 01:35:19 +00:00
-												Kernel: Rename ProtectedValue<T> => MutexProtected<T>

Let's make it obvious what we're protecting it with.

											
										
										
											2021-08-21 21:31:15 +00:00
+								MutexProtected<HashMap<IPv4SocketTuple, TCPSocket*>>& TCPSocket::sockets_by_tuple()
-												IPv4: Move more stuff from IPv4Socket to TCPSocket.

											
										
										
											2019-03-14 11:28:30 +00:00
+								{
-												Kernel: Switch singletons to use new Singleton class

MemoryManager cannot use the Singleton class because
MemoryManager::initialize is called before the global constructors
are run. That caused the Singleton to be re-initialized, causing
it to create another MemoryManager instance.

Fixes #3226

											
										
										
											2020-08-25 01:35:19 +00:00
+								    return *s_socket_tuples;
-												IPv4: Move more stuff from IPv4Socket to TCPSocket.

											
										
										
											2019-03-14 11:28:30 +00:00
+								}
-												Kernel: Use RefPtr instead of LockRefPtr for File and subclasses

This was mostly straightforward, as all the storage locations are
guarded by some related mutex.

The use of old-school associated mutexes instead of MutexProtected
is unfortunate, but the process to modernize such code is ongoing.

											
										
										
											2023-03-10 06:53:02 +00:00
+								RefPtr<TCPSocket> TCPSocket::from_tuple(IPv4SocketTuple const& tuple)
-												IPv4: Move more stuff from IPv4Socket to TCPSocket.

											
										
										
											2019-03-14 11:28:30 +00:00
+								{
-												Kernel: Use RefPtr instead of LockRefPtr for File and subclasses

This was mostly straightforward, as all the storage locations are
guarded by some related mutex.

The use of old-school associated mutexes instead of MutexProtected
is unfortunate, but the process to modernize such code is ongoing.

											
										
										
											2023-03-10 06:53:02 +00:00
+								    return sockets_by_tuple().with_shared([&](auto const& table) -> RefPtr<TCPSocket> {
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared lock to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								        auto exact_match = table.get(tuple);
 								        if (exact_match.has_value())
 								            return { *exact_match.value() };
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared lock to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								        auto address_tuple = IPv4SocketTuple(tuple.local_address(), tuple.local_port(), IPv4Address(), 0);
 								        auto address_match = table.get(address_tuple);
 								        if (address_match.has_value())
 								            return { *address_match.value() };
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared lock to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								        auto wildcard_tuple = IPv4SocketTuple(IPv4Address(), tuple.local_port(), IPv4Address(), 0);
 								        auto wildcard_match = table.get(wildcard_tuple);
 								        if (wildcard_match.has_value())
 								            return { *wildcard_match.value() };
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared lock to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								        return {};
 								    });
-												IPv4: Move more stuff from IPv4Socket to TCPSocket.

											
										
										
											2019-03-14 11:28:30 +00:00
+								}
-												Kernel: Use RefPtr instead of LockRefPtr for File and subclasses

This was mostly straightforward, as all the storage locations are
guarded by some related mutex.

The use of old-school associated mutexes instead of MutexProtected
is unfortunate, but the process to modernize such code is ongoing.

											
										
										
											2023-03-10 06:53:02 +00:00
+								ErrorOr<NonnullRefPtr<TCPSocket>> TCPSocket::try_create_client(IPv4Address const& new_local_address, u16 new_local_port, IPv4Address const& new_peer_address, u16 new_peer_port)
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
+								{
 								    auto tuple = IPv4SocketTuple(new_local_address, new_local_port, new_peer_address, new_peer_port);
-												Kernel: Use RefPtr instead of LockRefPtr for File and subclasses

This was mostly straightforward, as all the storage locations are
guarded by some related mutex.

The use of old-school associated mutexes instead of MutexProtected
is unfortunate, but the process to modernize such code is ongoing.

											
										
										
											2023-03-10 06:53:02 +00:00
+								    return sockets_by_tuple().with_exclusive([&](auto& table) -> ErrorOr<NonnullRefPtr<TCPSocket>> {
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared lock to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								        if (table.contains(tuple))
-												Kernel: Make TCPSocket client construction use KResultOr and TRY()

We don't really have anywhere to propagate the error in NetworkTask at
the moment, since it runs in its own kernel thread and has no direct
userspace caller.

											
										
										
											2021-09-07 12:44:29 +00:00
+								            return EEXIST;
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
-												Kernel: Make TCPSocket client construction use KResultOr and TRY()

We don't really have anywhere to propagate the error in NetworkTask at
the moment, since it runs in its own kernel thread and has no direct
userspace caller.

											
										
										
											2021-09-07 12:44:29 +00:00
+								        auto receive_buffer = TRY(try_create_receive_buffer());
 								        auto client = TRY(TCPSocket::try_create(protocol(), move(receive_buffer)));
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared lock to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								        client->set_setup_state(SetupState::InProgress);
 								        client->set_local_address(new_local_address);
 								        client->set_local_port(new_local_port);
 								        client->set_peer_address(new_peer_address);
 								        client->set_peer_port(new_peer_port);
-												Kernel/Net: Rework ephemeral port allocation

Currently, ephemeral port allocation is handled by the
allocate_local_port_if_needed() and protocol_allocate_local_port()
methods. Actually binding the socket to an address (which means
inserting the socket/address pair into a global map) is performed either
in protocol_allocate_local_port() (for ephemeral ports) or in
protocol_listen() (for non-ephemeral ports); the latter will fail with
EADDRINUSE if the address is already used by an existing pair present in
the map.

There used to be a bug where for listen() without an explicit bind(),
the port allocation would conflict with itself: first an ephemeral port
would get allocated and inserted into the map, and then
protocol_listen() would check again for the port being free, find the
just-created map entry, and error out. This was fixed in commit
01e5af487f9513696dbcacab15d3e0036446f586 by passing an additional flag
did_allocate_port into protocol_listen() which specifies whether the
port was just allocated, and skipping the check in protocol_listen() if
the flag is set.

However, this only helps if the socket is bound to an ephemeral port
inside of this very listen() call. But calling bind(sin_port = 0) from
userspace should succeed and bind to an allocated ephemeral port, in the
same way as using an unbound socket for connect() does. The port number
can then be retrieved from userspace by calling getsockname(), and it
should be possible to either connect() or listen() on this socket,
keeping the allocated port number. Also, calling bind() when already
bound (either explicitly or implicitly) should always result in EINVAL.

To untangle this, introduce an explicit m_bound state in IPv4Socket,
just like LocalSocket has already. Once a socket is bound, further
attempts to bind it fail. Some operations cause the socket to implicitly
get bound to an (ephemeral) address; this is implemented by the new
ensure_bound() method. The protocol_allocate_local_port() method is
gone; it is now up to a protocol to assign a port to the socket inside
protocol_bind() if it finds that the socket has local_port() == 0.

protocol_bind() is now called in more cases, such as inside listen() if
the socket wasn't bound before that.

											
										
										
											2023-07-23 12:43:45 +00:00
+								        client->set_bound(true);
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared-locked to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								        client->set_direction(Direction::Incoming);
 								        client->set_originator(*this);
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared-locked to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								        m_pending_release_for_accept.set(tuple, client);
 								        table.set(tuple, client);
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared-locked to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								        return { move(client) };
 								    });
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
+								}
-												Kernel: Hold socket back from accept() until it's fully set up

											
										
										
											2019-09-08 07:18:28 +00:00
+								void TCPSocket::release_to_originator()
 								{
-												Everywhere: Rename ASSERT => VERIFY

(...and ASSERT_NOT_REACHED => VERIFY_NOT_REACHED)

Since all of these checks are done in release builds as well,
let's rename them to VERIFY to prevent confusion, as everyone is
used to assertions being compiled out in release.

We can introduce a new ASSERT macro that is specifically for debug
checks, but I'm doing this wholesale conversion first since we've
accumulated thousands of these already, and it's not immediately
obvious which ones are suitable for ASSERT.

											
										
										
											2021-02-23 19:42:32 +00:00
+								    VERIFY(!!m_originator);
-												Kernel: Accept NNRP<Socket> instead of RP<Socket> in release_for_accept

This value is always non-null, so let's make it explicit.

											
										
										
											2022-04-09 19:25:39 +00:00
+								    m_originator.strong_ref()->release_for_accept(*this);
-												Kernel: Remove socket from the listener's accept list when it is closed

Without this patch we end up with sockets in the listener's accept
queue with state 'closed' when doing stealth SYN scans:

Client -> Server: SYN for port 22
Server -> Client: SYN/ACK
Client -> Server: RST (i.e. don't complete the TCP handshake)

											
										
										
											2021-04-30 19:43:37 +00:00
+								    m_originator.clear();
-												Kernel: Hold socket back from accept() until it's fully set up

											
										
										
											2019-09-08 07:18:28 +00:00
+								}
-												Kernel: Use RefPtr instead of LockRefPtr for File and subclasses

This was mostly straightforward, as all the storage locations are
guarded by some related mutex.

The use of old-school associated mutexes instead of MutexProtected
is unfortunate, but the process to modernize such code is ongoing.

											
										
										
											2023-03-10 06:53:02 +00:00
+								void TCPSocket::release_for_accept(NonnullRefPtr<TCPSocket> socket)
-												Kernel: Hold socket back from accept() until it's fully set up

											
										
										
											2019-09-08 07:18:28 +00:00
+								{
-												Everywhere: Rename ASSERT => VERIFY

(...and ASSERT_NOT_REACHED => VERIFY_NOT_REACHED)

Since all of these checks are done in release builds as well,
let's rename them to VERIFY to prevent confusion, as everyone is
used to assertions being compiled out in release.

We can introduce a new ASSERT macro that is specifically for debug
checks, but I'm doing this wholesale conversion first since we've
accumulated thousands of these already, and it's not immediately
obvious which ones are suitable for ASSERT.

											
										
										
											2021-02-23 19:42:32 +00:00
+								    VERIFY(m_pending_release_for_accept.contains(socket->tuple()));
-												Kernel: Hold socket back from accept() until it's fully set up

											
										
										
											2019-09-08 07:18:28 +00:00
+								    m_pending_release_for_accept.remove(socket->tuple());
-												Kernel: Suppress remaining unobserved KResult return codes

These are all cases where there is no clear and easy fix,
I've left FIXME bread crumbs so that these can hopefully
be fixed over time.

											
										
										
											2020-08-05 09:13:30 +00:00
+								    // FIXME: Should we observe this error somehow?
-												Kernel: Accept NNRP<Socket> instead of RP<Socket> in release_for_accept

This value is always non-null, so let's make it explicit.

											
										
										
											2022-04-09 19:25:39 +00:00
+								    [[maybe_unused]] auto rc = queue_connection_from(move(socket));
-												Kernel: Hold socket back from accept() until it's fully set up

											
										
										
											2019-09-08 07:18:28 +00:00
+								}
-												Kernel: TCPSocket always has a scratch buffer

Let's encode this in the constructor signature.

											
										
										
											2021-09-07 13:11:49 +00:00
+								TCPSocket::TCPSocket(int protocol, NonnullOwnPtr<DoubleBuffer> receive_buffer, NonnullOwnPtr<KBuffer> scratch_buffer)
-												Kernel: Handle OOM when allocating IPv4Socket optional scratch buffer

											
										
										
											2021-08-01 12:11:05 +00:00
+								    : IPv4Socket(SOCK_STREAM, protocol, move(receive_buffer), move(scratch_buffer))
-												Kernel/Net: Use monotonic time for TCP times

These were using real time as a mistake before; changing the system time
during ongoing TCP connections shouldn’t break them.

											
										
										
											2023-08-17 17:20:42 +00:00
+								    , m_last_ack_sent_time(TimeManagement::the().monotonic_time())
 								    , m_last_retransmit_time(TimeManagement::the().monotonic_time())
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								{
 								}
 								TCPSocket::~TCPSocket()
 								{
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
+								    dequeue_for_retransmit();
-												Everywhere: Replace dbgln<flag>(...) with dbgln_if(flag, ...)

Replacement made by `find Kernel Userland -name '*.h' -o -name '*.cpp' | sed -i -Ee 's/dbgln\b<(\w+)>\(/dbgln_if(\1, /g'`

											
										
										
											2021-02-07 12:03:24 +00:00
+								    dbgln_if(TCP_SOCKET_DEBUG, "~TCPSocket in state {}", to_string(state()));
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								}
-												Kernel: Use RefPtr instead of LockRefPtr for File and subclasses

This was mostly straightforward, as all the storage locations are
guarded by some related mutex.

The use of old-school associated mutexes instead of MutexProtected
is unfortunate, but the process to modernize such code is ongoing.

											
										
										
											2023-03-10 06:53:02 +00:00
+								ErrorOr<NonnullRefPtr<TCPSocket>> TCPSocket::try_create(int protocol, NonnullOwnPtr<DoubleBuffer> receive_buffer)
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								{
-												Kernel: Handle OOM when allocating IPv4Socket optional scratch buffer

											
										
										
											2021-08-01 12:11:05 +00:00
+								    // Note: Scratch buffer is only used for SOCK_STREAM sockets.
-												Kernel: Annotate all `KBuffer` and `DoubleBuffer` with a custom name

											
										
										
											2022-04-10 22:08:07 +00:00
+								    auto scratch_buffer = TRY(KBuffer::try_create_with_size("TCPSocket: Scratch buffer"sv, 65536));
-												Kernel: Use RefPtr instead of LockRefPtr for File and subclasses

This was mostly straightforward, as all the storage locations are
guarded by some related mutex.

The use of old-school associated mutexes instead of MutexProtected
is unfortunate, but the process to modernize such code is ongoing.

											
										
										
											2023-03-10 06:53:02 +00:00
+								    return adopt_nonnull_ref_or_enomem(new (nothrow) TCPSocket(protocol, move(receive_buffer), move(scratch_buffer)));
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								}
-												Kernel: Return the correct result for FIONREAD on datagram sockets

Before this commit, we only checked the receive buffer on the socket,
which is unused on datagram streams. Now we return the actual size of
the datagram without the protocol headers, which required the protocol
to tell us what the size of the payload is.

											
										
										
											2021-09-11 19:38:05 +00:00
+								ErrorOr<size_t> TCPSocket::protocol_size(ReadonlyBytes raw_ipv4_packet)
 								{
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								    auto& ipv4_packet = *reinterpret_cast<IPv4Packet const*>(raw_ipv4_packet.data());
 								    auto& tcp_packet = *static_cast<TCPPacket const*>(ipv4_packet.payload());
-												Kernel: Return the correct result for FIONREAD on datagram sockets

Before this commit, we only checked the receive buffer on the socket,
which is unused on datagram streams. Now we return the actual size of
the datagram without the protocol headers, which required the protocol
to tell us what the size of the payload is.

											
										
										
											2021-09-11 19:38:05 +00:00
+								    return raw_ipv4_packet.size() - sizeof(IPv4Packet) - tcp_packet.header_size();
 								}
-												Kernel: Replace KResult and KResultOr<T> with Error and ErrorOr<T>

We now use AK::Error and AK::ErrorOr<T> in both kernel and userspace!
This was a slightly tedious refactoring that took a long time, so it's
not unlikely that some bugs crept in.

Nevertheless, it does pass basic functionality testing, and it's just
real nice to finally see the same pattern in all contexts. :^)

											
										
										
											2021-11-07 23:51:39 +00:00
+								ErrorOr<size_t> TCPSocket::protocol_receive(ReadonlyBytes raw_ipv4_packet, UserOrKernelBuffer& buffer, size_t buffer_size, [[maybe_unused]] int flags)
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								{
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								    auto& ipv4_packet = *reinterpret_cast<IPv4Packet const*>(raw_ipv4_packet.data());
 								    auto& tcp_packet = *static_cast<TCPPacket const*>(ipv4_packet.payload());
-												Kernel/Net: Make IPv4Socket::protocol_receive() take a ReadonlyBytes

The overrides of this function don't need to know how the original
packet was stored, so let's just give them a ReadonlyBytes view of
the raw packet data.

											
										
										
											2020-12-18 15:13:23 +00:00
+								    size_t payload_size = raw_ipv4_packet.size() - sizeof(IPv4Packet) - tcp_packet.header_size();
-												Kernel: Convert klog() => dmesgln() in TCPSocket

											
										
										
											2021-03-09 22:06:47 +00:00
+								    dbgln_if(TCP_SOCKET_DEBUG, "payload_size {}, will it fit in {}?", payload_size, buffer_size);
-												Everywhere: Rename ASSERT => VERIFY

(...and ASSERT_NOT_REACHED => VERIFY_NOT_REACHED)

Since all of these checks are done in release builds as well,
let's rename them to VERIFY to prevent confusion, as everyone is
used to assertions being compiled out in release.

We can introduce a new ASSERT macro that is specifically for debug
checks, but I'm doing this wholesale conversion first since we've
accumulated thousands of these already, and it's not immediately
obvious which ones are suitable for ASSERT.

											
										
										
											2021-02-23 19:42:32 +00:00
+								    VERIFY(buffer_size >= payload_size);
-												Kernel/Net: Add a special SOCKET_TRY() and use it in socket code

Sockets remember their last error code in the SO_ERROR field, so we need
to take special care to remember this when returning an error.

This patch adds a SOCKET_TRY() that works like TRY() but also calls
set_so_error() on the failure path.

There's probably a lot more code that should be using this, but that's
outside the scope of this patch.

											
										
										
											2021-09-07 13:05:51 +00:00
+								    SOCKET_TRY(buffer.write(tcp_packet.payload(), payload_size));
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								    return payload_size;
 								}
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								ErrorOr<size_t> TCPSocket::protocol_send(UserOrKernelBuffer const& data, size_t data_length)
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								{
-												Kernel/Net: Iron out the locking mechanism across the subsystem

There is a big mix of LockRefPtrs all over the Networking subsystem, as
well as lots of room for improvements with our locking patterns, which
this commit will not pursue, but will give a good start for such work.

To deal with this situation, we change the following things:
- Creating instances of NetworkAdapter should always yield a non-locking
  NonnullRefPtr. Acquiring an instance from the NetworkingManagement
  should give a simple RefPtr, as giving LockRefPtr does not really
  protect from concurrency problems in such case.
- Since NetworkingManagement works with normal RefPtrs we should
  protect all instances of RefPtr<NetworkAdapter> with SpinlockProtected
  to ensure references are not gone unexpectedly.
- Protect the so_error class member with a proper spinlock. This happens
  to be important because the clear_so_error() method lacked any proper
  locking measures. It also helps prevent a possible TOCTOU when we
  might do more fine-grained locking in the Socket code, so this could
  definitely be a start for that.
- Change unnecessary LockRefPtr<PacketWithTimestamp> in the structure
  of OutgoingPacket to a simple RefPtr<PacketWithTimestamp> as the whole
  list should be MutexProtected.

											
										
										
											2023-04-11 00:50:15 +00:00
+								    auto adapter = bound_interface().with([](auto& bound_device) -> RefPtr<NetworkAdapter> { return bound_device; });
 								    RoutingDecision routing_decision = route_to(peer_address(), local_address(), adapter);
-												Kernel: Don't try to send TCP packets larger than the MSS

Previously TCPSocket::send_tcp_packet() would try to send TCP packets
which matched whatever size the userspace program specified. We'd try to
break those packets up into smaller fragments, however a much better
approach is to limit TCP packets to the maximum segment size and
avoid fragmentation altogether.

											
										
										
											2021-05-25 19:29:37 +00:00
+								    if (routing_decision.is_zero())
-												Kernel: Add so_error to keep track of the socket's error state

This sets the m_so_error variable every time the socket returns an
error.

											
										
										
											2021-08-01 15:27:23 +00:00
+								        return set_so_error(EHOSTUNREACH);
-												Kernel: Don't try to send TCP packets larger than the MSS

Previously TCPSocket::send_tcp_packet() would try to send TCP packets
which matched whatever size the userspace program specified. We'd try to
break those packets up into smaller fragments, however a much better
approach is to limit TCP packets to the maximum segment size and
avoid fragmentation altogether.

											
										
										
											2021-05-25 19:29:37 +00:00
+								    size_t mss = routing_decision.adapter->mtu() - sizeof(IPv4Packet) - sizeof(TCPPacket);
-												Kernel: Implement Nagle’s Algorithm

This is an initial implementation, about as basic as intended by the
RFC, and not configurable from userspace at the moment. It should reduce
the number of small packets sent, reducing overhead and thereby
network traffic.

											
										
										
											2023-08-17 17:53:49 +00:00
 								    // RFC 896 (Nagle’s algorithm): https://www.ietf.org/rfc/rfc0896
 								    // "The solution is to inhibit the sending of new TCP  segments when
 								    //  new  outgoing  data  arrives  from  the  user  if  any previously
 								    //  transmitted data on the connection remains unacknowledged.   This
 								    //  inhibition  is  to be unconditional; no timers, tests for size of
 								    //  data received, or other conditions are required."
 								    // FIXME: Make this configurable via TCP_NODELAY.
 								    auto has_unacked_data = m_unacked_packets.with_shared([&](auto const& packets) { return packets.size > 0; });
 								    if (has_unacked_data && data_length < mss)
 								        return 0;
-												Kernel: Don't try to send TCP packets larger than the MSS

Previously TCPSocket::send_tcp_packet() would try to send TCP packets
which matched whatever size the userspace program specified. We'd try to
break those packets up into smaller fragments, however a much better
approach is to limit TCP packets to the maximum segment size and
avoid fragmentation altogether.

											
										
										
											2021-05-25 19:29:37 +00:00
+								    data_length = min(data_length, mss);
-												Kernel: Rename TCPFlags::PUSH => PSH

Let's use the proper name of this TCP flag.

											
										
										
											2022-03-17 13:24:21 +00:00
+								    TRY(send_tcp_packet(TCPFlags::PSH | TCPFlags::ACK, &data, data_length, &routing_decision));
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								    return data_length;
 								}
-												Kernel: Replace KResult and KResultOr<T> with Error and ErrorOr<T>

We now use AK::Error and AK::ErrorOr<T> in both kernel and userspace!
This was a slightly tedious refactoring that took a long time, so it's
not unlikely that some bugs crept in.

Nevertheless, it does pass basic functionality testing, and it's just
real nice to finally see the same pattern in all contexts. :^)

											
										
										
											2021-11-07 23:51:39 +00:00
+								ErrorOr<void> TCPSocket::send_ack(bool allow_duplicate)
-												Kernel: Coalesce TCP ACKs

Previously we'd send a TCP ACK for each TCP packet we received. This
changes NetworkTask so that we send fewer TCP ACKs.

											
										
										
											2021-05-12 07:14:37 +00:00
+								{
 								    if (!allow_duplicate && m_last_ack_number_sent == m_ack_number)
-												Kernel: Replace KResult and KResultOr<T> with Error and ErrorOr<T>

We now use AK::Error and AK::ErrorOr<T> in both kernel and userspace!
This was a slightly tedious refactoring that took a long time, so it's
not unlikely that some bugs crept in.

Nevertheless, it does pass basic functionality testing, and it's just
real nice to finally see the same pattern in all contexts. :^)

											
										
										
											2021-11-07 23:51:39 +00:00
+								        return {};
-												Kernel: Coalesce TCP ACKs

Previously we'd send a TCP ACK for each TCP packet we received. This
changes NetworkTask so that we send fewer TCP ACKs.

											
										
										
											2021-05-12 07:14:37 +00:00
+								    return send_tcp_packet(TCPFlags::ACK);
 								}
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								ErrorOr<void> TCPSocket::send_tcp_packet(u16 flags, UserOrKernelBuffer const* payload, size_t payload_size, RoutingDecision* user_routing_decision)
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								{
-												Kernel/Net: Iron out the locking mechanism across the subsystem

There is a big mix of LockRefPtrs all over the Networking subsystem, as
well as lots of room for improvements with our locking patterns, which
this commit will not pursue, but will give a good start for such work.

To deal with this situation, we change the following things:
- Creating instances of NetworkAdapter should always yield a non-locking
  NonnullRefPtr. Acquiring an instance from the NetworkingManagement
  should give a simple RefPtr, as giving LockRefPtr does not really
  protect from concurrency problems in such case.
- Since NetworkingManagement works with normal RefPtrs we should
  protect all instances of RefPtr<NetworkAdapter> with SpinlockProtected
  to ensure references are not gone unexpectedly.
- Protect the so_error class member with a proper spinlock. This happens
  to be important because the clear_so_error() method lacked any proper
  locking measures. It also helps prevent a possible TOCTOU when we
  might do more fine-grained locking in the Socket code, so this could
  definitely be a start for that.
- Change unnecessary LockRefPtr<PacketWithTimestamp> in the structure
  of OutgoingPacket to a simple RefPtr<PacketWithTimestamp> as the whole
  list should be MutexProtected.

											
										
										
											2023-04-11 00:50:15 +00:00
+								    auto adapter = bound_interface().with([](auto& bound_device) -> RefPtr<NetworkAdapter> { return bound_device; });
 								    RoutingDecision routing_decision = user_routing_decision ? *user_routing_decision : route_to(peer_address(), local_address(), adapter);
-												Kernel: Avoid allocations when sending IP packets

Previously we'd allocate buffers when sending packets. This patch
avoids these allocations by using the NetworkAdapter's packet queue.

At the same time this also avoids copying partially constructed
packets in order to prepend Ethernet and/or IPv4 headers. It also
properly truncates UDP and raw IP packets.

											
										
										
											2021-05-26 03:35:05 +00:00
+								    if (routing_decision.is_zero())
-												Kernel: Add so_error to keep track of the socket's error state

This sets the m_so_error variable every time the socket returns an
error.

											
										
										
											2021-08-01 15:27:23 +00:00
+								        return set_so_error(EHOSTUNREACH);
-												Kernel: Avoid allocations when sending IP packets

Previously we'd allocate buffers when sending packets. This patch
avoids these allocations by using the NetworkAdapter's packet queue.

At the same time this also avoids copying partially constructed
packets in order to prepend Ethernet and/or IPv4 headers. It also
properly truncates UDP and raw IP packets.

											
										
										
											2021-05-26 03:35:05 +00:00
 								    auto ipv4_payload_offset = routing_decision.adapter->ipv4_payload_offset();
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								    bool const has_mss_option = flags == TCPFlags::SYN;
-												Kernel: Set MSS option for outbound TCP SYN packets

When the MSS option header is missing the default maximum segment
size is 536 which results in lots of very small TCP packets that
NetworkTask has to handle.

This adds the MSS option header to outbound TCP SYN packets and
sets it to an appropriate value depending on the interface's MTU.

Note that we do not currently do path MTU discovery so this could
cause problems when hops don't fragment packets properly.

											
										
										
											2021-05-11 19:09:11 +00:00
+								    const size_t options_size = has_mss_option ? sizeof(TCPOptionMSS) : 0;
-												Kernel: Avoid allocations when sending IP packets

Previously we'd allocate buffers when sending packets. This patch
avoids these allocations by using the NetworkAdapter's packet queue.

At the same time this also avoids copying partially constructed
packets in order to prepend Ethernet and/or IPv4 headers. It also
properly truncates UDP and raw IP packets.

											
										
										
											2021-05-26 03:35:05 +00:00
+								    const size_t tcp_header_size = sizeof(TCPPacket) + options_size;
 								    const size_t buffer_size = ipv4_payload_offset + tcp_header_size + payload_size;
 								    auto packet = routing_decision.adapter->acquire_packet_buffer(buffer_size);
 								    if (!packet)
-												Kernel: Add so_error to keep track of the socket's error state

This sets the m_so_error variable every time the socket returns an
error.

											
										
										
											2021-08-01 15:27:23 +00:00
+								        return set_so_error(ENOMEM);
-												Kernel: Avoid allocations when sending IP packets

Previously we'd allocate buffers when sending packets. This patch
avoids these allocations by using the NetworkAdapter's packet queue.

At the same time this also avoids copying partially constructed
packets in order to prepend Ethernet and/or IPv4 headers. It also
properly truncates UDP and raw IP packets.

											
										
										
											2021-05-26 03:35:05 +00:00
+								    routing_decision.adapter->fill_in_ipv4_header(*packet, local_address(),
 								        routing_decision.next_hop, peer_address(), IPv4Protocol::TCP,
-												Kernel+LibC: Add support for the IPv4 TOS field via the IP_TOS sockopt

											
										
										
											2021-10-27 20:20:24 +00:00
+								        buffer_size - ipv4_payload_offset, type_of_service(), ttl());
-												Kernel: Handle OOM when allocating Packet KBuffers

											
										
										
											2021-08-01 12:11:49 +00:00
+								    memset(packet->buffer->data() + ipv4_payload_offset, 0, sizeof(TCPPacket));
 								    auto& tcp_packet = *(TCPPacket*)(packet->buffer->data() + ipv4_payload_offset);
-												Everywhere: Rename ASSERT => VERIFY

(...and ASSERT_NOT_REACHED => VERIFY_NOT_REACHED)

Since all of these checks are done in release builds as well,
let's rename them to VERIFY to prevent confusion, as everyone is
used to assertions being compiled out in release.

We can introduce a new ASSERT macro that is specifically for debug
checks, but I'm doing this wholesale conversion first since we've
accumulated thousands of these already, and it's not immediately
obvious which ones are suitable for ASSERT.

											
										
										
											2021-02-23 19:42:32 +00:00
+								    VERIFY(local_port());
-												IPv4: Rename source/destination in socket classes to local/peer.

It was way too ambiguous who's the source and who's the destination, and it
didn't really follow a logical pattern. "Local port" vs "Peer port" is super
obvious, so let's call it that.

											
										
										
											2019-05-04 14:40:34 +00:00
+								    tcp_packet.set_source_port(local_port());
 								    tcp_packet.set_destination_port(peer_port());
-												Kernel: Increase the default TCP window size

This increases the default TCP window size to a more reasonable
value of 64k. This allows TCP peers to send us more packets before
waiting for corresponding ACKs.

											
										
										
											2021-05-11 14:18:33 +00:00
+								    tcp_packet.set_window_size(NumericLimits<u16>::max());
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								    tcp_packet.set_sequence_number(m_sequence_number);
-												Kernel: Avoid allocations when sending IP packets

Previously we'd allocate buffers when sending packets. This patch
avoids these allocations by using the NetworkAdapter's packet queue.

At the same time this also avoids copying partially constructed
packets in order to prepend Ethernet and/or IPv4 headers. It also
properly truncates UDP and raw IP packets.

											
										
										
											2021-05-26 03:35:05 +00:00
+								    tcp_packet.set_data_offset(tcp_header_size / sizeof(u32));
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								    tcp_packet.set_flags(flags);
-												Kernel: Make UserOrKernelBuffer return KResult from read/write/memset

This allows us to simplify a whole bunch of call sites with TRY(). :^)

											
										
										
											2021-09-07 10:09:52 +00:00
+								    if (payload) {
 								        if (auto result = payload->read(tcp_packet.payload(), payload_size); result.is_error()) {
 								            routing_decision.adapter->release_packet_buffer(*packet);
-												Kernel: Replace KResult and KResultOr<T> with Error and ErrorOr<T>

We now use AK::Error and AK::ErrorOr<T> in both kernel and userspace!
This was a slightly tedious refactoring that took a long time, so it's
not unlikely that some bugs crept in.

Nevertheless, it does pass basic functionality testing, and it's just
real nice to finally see the same pattern in all contexts. :^)

											
										
										
											2021-11-07 23:51:39 +00:00
+								            return set_so_error(result.release_error());
-												Kernel: Make UserOrKernelBuffer return KResult from read/write/memset

This allows us to simplify a whole bunch of call sites with TRY(). :^)

											
										
										
											2021-09-07 10:09:52 +00:00
+								        }
-												Kernel: Release packet buffer in TCPSocket::send_tcp_packet

Previously we wouldn't release the buffer back to the network adapter
in all cases. While this didn't leak the buffer it would cause the
buffer to not be reused for other packets.

											
										
										
											2021-05-27 22:49:53 +00:00
+								    }
-												Kernel: Make copy_to/from_user safe and remove unnecessary checks

Since the CPU already does almost all necessary validation steps
for us, we don't really need to attempt to do this. Doing it
ourselves doesn't really work very reliably, because we'd have to
account for other processors modifying virtual memory, and we'd
have to account for e.g. pages not being able to be allocated
due to insufficient resources.

So change the copy_to/from_user (and associated helper functions)
to use the new safe_memcpy, which will return whether it succeeded
or not. The only manual validation step needed (which the CPU
can't perform for us) is making sure the pointers provided by user
mode aren't pointing to kernel mappings.

To make it easier to read/write from/to either kernel or user mode
data add the UserOrKernelBuffer helper class, which will internally
either use copy_from/to_user or directly memcpy, or pass the data
through directly using a temporary buffer on the stack.

Last but not least we need to keep syscall params trivial as we
need to copy them from/to user mode using copy_from/to_user.

											
										
										
											2020-09-12 03:11:07 +00:00
-												Kernel/Net: Don't update TCP socket "last sent ACK" field too early

Defer updating this field until after the last fallible operation has
succeeded.

											
										
										
											2022-02-11 11:38:10 +00:00
+								    if (flags & TCPFlags::ACK) {
 								        m_last_ack_number_sent = m_ack_number;
-												Kernel/Net: Use monotonic time for TCP times

These were using real time by mistake before; changing the system time
during ongoing TCP connections shouldn’t break them.

											
										
										
											2023-08-17 17:20:42 +00:00
+								        m_last_ack_sent_time = TimeManagement::the().monotonic_time();
-												Kernel/Net: Don't update TCP socket "last sent ACK" field too early

Defer updating this field until after the last fallible operation has
succeeded.

											
										
										
											2022-02-11 11:38:10 +00:00
+								        tcp_packet.set_ack_number(m_ack_number);
 								    }
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
+								    if (flags & TCPFlags::SYN) {
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								        ++m_sequence_number;
 								    } else {
 								        m_sequence_number += payload_size;
 								    }
-												Kernel: Set MSS option for outbound TCP SYN packets

When the MSS option header is missing the default maximum segment
size is 536 which results in lots of very small TCP packets that
NetworkTask has to handle.

This adds the MSS option header to outbound TCP SYN packets and
sets it to an appropriate value depending on the interface's MTU.

Note that we do not currently do path MTU discovery so this could
cause problems when hops don't fragment packets properly.

											
										
										
											2021-05-11 19:09:11 +00:00
+								    if (has_mss_option) {
 								        u16 mss = routing_decision.adapter->mtu() - sizeof(IPv4Packet) - sizeof(TCPPacket);
 								        TCPOptionMSS mss_option { mss };
-												Kernel: Handle OOM when allocating Packet KBuffers

											
										
										
											2021-08-01 12:11:49 +00:00
+								        VERIFY(packet->buffer->size() >= ipv4_payload_offset + sizeof(TCPPacket) + sizeof(mss_option));
 								        memcpy(packet->buffer->data() + ipv4_payload_offset + sizeof(TCPPacket), &mss_option, sizeof(mss_option));
-												Kernel: Set MSS option for outbound TCP SYN packets

When the MSS option header is missing the default maximum segment
size is 536 which results in lots of very small TCP packets that
NetworkTask has to handle.

This adds the MSS option header to outbound TCP SYN packets and
sets it to an appropriate value depending on the interface's MTU.

Note that we do not currently do path MTU discovery so this could
cause problems when hops don't fragment packets properly.

											
										
										
											2021-05-11 19:09:11 +00:00
+								    }
-												Kernel: Refactor TCP/IP stack

This has several significant changes to the networking stack.

* Significant refactoring of the TCP state machine. Right now it's
  probably more fragile than it used to be, but handles quite a lot
  more of the handshake process.
* `TCPSocket` holds a `NetworkAdapter*`, assigned during `connect()` or
  `bind()`, whichever comes first.
* `listen()` is now virtual in `Socket` and intended to be implemented
  in its child classes
* `listen()` no longer works without `bind()` - this is a bit of a
  regression, but listening sockets didn't work at all before, so it's
  not possible to observe the regression.
* A file is exposed at `/proc/net_tcp`, which is a JSON document listing
  the current TCP sockets with a bit of metadata.
* There's an `ETHERNET_VERY_DEBUG` flag for dumping packet's content out
  to `kprintf`. It is, indeed, _very debug_.

											
										
										
											2019-08-06 13:40:38 +00:00
+								    tcp_packet.set_checksum(compute_tcp_checksum(local_address(), peer_address(), tcp_packet, payload_size));
-												Kernel: Implement outgoing TCP retransmission and better ACK handling

This approach is a bit naive - whenever we send a packet out, we
check to see if there are any other packets we should try to send.
This works well enough for a busy connection but not very well for a
quiet one. Ideally we would check for not-acked packets on some kind
of timer, and use the length of this not-acked list as feedback to
throttle the writes coming from userspace.

											
										
										
											2019-09-08 07:38:08 +00:00
-												AK+Kernel: Handle some allocation failures in IPv4Socket and TCPSocket

This adds try_* methods to AK::SinglyLinkedList and
AK::SinglyLinkedListWithCount and updates the network stack to use
those to gracefully handle allocation failures.

Refs #6369.

											
										
										
											2022-11-01 09:04:13 +00:00
+								    bool expect_ack { tcp_packet.has_syn() || payload_size > 0 };
 								    if (expect_ack) {
 								        bool append_failed { false };
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								        m_unacked_packets.with_exclusive([&](auto& unacked_packets) {
-												AK+Kernel: Handle some allocation failures in IPv4Socket and TCPSocket

This adds try_* methods to AK::SinglyLinkedList and
AK::SinglyLinkedListWithCount and updates the network stack to use
those to gracefully handle allocation failures.

Refs #6369.

											
										
										
											2022-11-01 09:04:13 +00:00
+								            auto result = unacked_packets.packets.try_append({ m_sequence_number, packet, ipv4_payload_offset, *routing_decision.adapter });
 								            if (result.is_error()) {
 								                dbgln("TCPSocket: Dropped outbound packet because try_append() failed");
 								                append_failed = true;
 								                return;
 								            }
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								            unacked_packets.size += payload_size;
 								            enqueue_for_retransmit();
 								        });
-												AK+Kernel: Handle some allocation failures in IPv4Socket and TCPSocket

This adds try_* methods to AK::SinglyLinkedList and
AK::SinglyLinkedListWithCount and updates the network stack to use
those to gracefully handle allocation failures.

Refs #6369.

											
										
										
											2022-11-01 09:04:13 +00:00
+								        if (append_failed)
 								            return set_so_error(ENOMEM);
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
+								    }
-												AK+Kernel: Handle some allocation failures in IPv4Socket and TCPSocket

This adds try_* methods to AK::SinglyLinkedList and
AK::SinglyLinkedListWithCount and updates the network stack to use
those to gracefully handle allocation failures.

Refs #6369.

											
										
										
											2022-11-01 09:04:13 +00:00
+								    m_packets_out++;
 								    m_bytes_out += buffer_size;
 								    routing_decision.adapter->send_packet(packet->bytes());
 								    if (!expect_ack)
 								        routing_decision.adapter->release_packet_buffer(*packet);
-												Kernel: Replace KResult and KResultOr<T> with Error and ErrorOr<T>

We now use AK::Error and AK::ErrorOr<T> in both kernel and userspace!
This was a slightly tedious refactoring that took a long time, so it's
not unlikely that some bugs crept in.

Nevertheless, it does pass basic functionality testing, and it's just
real nice to finally see the same pattern in all contexts. :^)

											
										
										
											2021-11-07 23:51:39 +00:00
+								    return {};
-												Kernel: Implement outgoing TCP retransmission and better ACK handling

This approach is a bit naive - whenever we send a packet out, we
check to see if there are any other packets we should try to send.
This works well enough for a busy connection but not very well for a
quiet one. Ideally we would check for not-acked packets on some kind
of timer, and use the length of this not-acked list as feedback to
throttle the writes coming from userspace.

											
										
										
											2019-09-08 07:38:08 +00:00
+								}
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								void TCPSocket::receive_tcp_packet(TCPPacket const& packet, u16 size)
-												Kernel: Record network statistics and expose as JSON

This is comprised of five small changes:

* Keep a counter for tx/rx packets/bytes per TCP socket
* Keep a counter for tx/rx packets/bytes per network adapter
* Expose that data in /proc/net_tcp and /proc/netadapters
* Convert /proc/netadapters to JSON
* Fix up ifconfig to read the JSON from netadapters

											
										
										
											2019-08-08 02:32:35 +00:00
+								{
-												Kernel: Implement outgoing TCP retransmission and better ACK handling

This approach is a bit naive - whenever we send a packet out, we
check to see if there are any other packets we should try to send.
This works well enough for a busy connection but not very well for a
quiet one. Ideally we would check for not-acked packets on some kind
of timer, and use the length of this not-acked list as feedback to
throttle the writes coming from userspace.

											
										
										
											2019-09-08 07:38:08 +00:00
+								    if (packet.has_ack()) {
 								        u32 ack_number = packet.ack_number();
-												Everywhere: Replace dbgln<flag>(...) with dbgln_if(flag, ...)

Replacement made by `find Kernel Userland -name '*.h' -o -name '*.cpp' | sed -i -Ee 's/dbgln\b<(\w+)>\(/dbgln_if(\1, /g'`

											
										
										
											2021-02-07 12:03:24 +00:00
+								        dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket: receive_tcp_packet: {}", ack_number);
-												Kernel: Implement outgoing TCP retransmission and better ACK handling

This approach is a bit naive - whenever we send a packet out, we
check to see if there are any other packets we should try to send.
This works well enough for a busy connection but not very well for a
quiet one. Ideally we would check for not-acked packets on some kind
of timer, and use the length of this not-acked list as feedback to
throttle the writes coming from userspace.

											
										
										
											2019-09-08 07:38:08 +00:00
 								        int removed = 0;
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								        m_unacked_packets.with_exclusive([&](auto& unacked_packets) {
 								            while (!unacked_packets.packets.is_empty()) {
 								                auto& packet = unacked_packets.packets.first();
 								                dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket: iterate: {}", packet.ack_number);
 								                if (packet.ack_number <= ack_number) {
 								                    auto old_adapter = packet.adapter.strong_ref();
 								                    if (old_adapter)
 								                        old_adapter->release_packet_buffer(*packet.buffer);
 								                    TCPPacket& tcp_packet = *(TCPPacket*)(packet.buffer->buffer->data() + packet.ipv4_payload_offset);
-												Kernel/TCPSocket: Read window size from peer

During receive_tcp_packet(), we now set m_send_window_size for the
socket if it is different from the default.

This removes one FIXME from TCPSocket.h.

											
										
										
											2023-06-19 02:05:00 +00:00
+								                    if (m_send_window_size != tcp_packet.window_size()) {
 								                        m_send_window_size = tcp_packet.window_size();
 								                    }
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								                    auto payload_size = packet.buffer->buffer->data() + packet.buffer->buffer->size() - (u8*)tcp_packet.payload();
 								                    unacked_packets.size -= payload_size;
 								                    evaluate_block_conditions();
 								                    unacked_packets.packets.take_first();
 								                    removed++;
 								                } else {
 								                    break;
 								                }
-												Kernel: Implement outgoing TCP retransmission and better ACK handling

This approach is a bit naive - whenever we send a packet out, we
check to see if there are any other packets we should try to send.
This works well enough for a busy connection but not very well for a
quiet one. Ideally we would check for not-acked packets on some kind
of timer, and use the length of this not-acked list as feedback to
throttle the writes coming from userspace.

											
										
										
											2019-09-08 07:38:08 +00:00
+								            }
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								            if (unacked_packets.packets.is_empty()) {
 								                m_retransmit_attempts = 0;
 								                dequeue_for_retransmit();
 								            }
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								            dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket: receive_tcp_packet acknowledged {} packets", removed);
 								        });
-												Kernel: Implement outgoing TCP retransmission and better ACK handling

This approach is a bit naive - whenever we send a packet out, we
check to see if there are any other packets we should try to send.
This works well enough for a busy connection but not very well for a
quiet one. Ideally we would check for not-acked packets on some kind
of timer, and use the length of this not-acked list as feedback to
throttle the writes coming from userspace.

											
										
										
											2019-09-08 07:38:08 +00:00
+								    }
-												Kernel: Record network statistics and expose as JSON

This is comprised of five small changes:

* Keep a counter for tx/rx packets/bytes per TCP socket
* Keep a counter for tx/rx packets/bytes per network adapter
* Expose that data in /proc/net_tcp and /proc/netadapters
* Convert /proc/netadapters to JSON
* Fix up ifconfig to read the JSON from netadapters

											
										
										
											2019-08-08 02:32:35 +00:00
+								    m_packets_in++;
-												Kernel: Implement outgoing TCP retransmission and better ACK handling

This approach is a bit naive - whenever we send a packet out, we
check to see if there are any other packets we should try to send.
This works well enough for a busy connection but not very well for a
quiet one. Ideally we would check for not-acked packets on some kind
of timer, and use the length of this not-acked list as feedback to
throttle the writes coming from userspace.

											
										
										
											2019-09-08 07:38:08 +00:00
+								    m_bytes_in += packet.header_size() + size;
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								}
-												Kernel: Coalesce TCP ACKs

Previously we'd send a TCP ACK for each TCP packet we received. This
changes NetworkTask so that we send fewer TCP ACKs.

											
										
										
											2021-05-12 07:14:37 +00:00
+								bool TCPSocket::should_delay_next_ack() const
 								{
 								    // FIXME: We don't know the MSS here so make a reasonable guess.
 								    const size_t mss = 1500;
 								    // RFC 1122 says we should send an ACK for every two full-sized segments.
 								    if (m_ack_number >= m_last_ack_number_sent + 2 * mss)
 								        return false;
 								    // RFC 1122 says we should not delay ACKs for more than 500 milliseconds.
-												Kernel/Net: Use monotonic time for TCP times

These were using real time by mistake before; changing the system time
during ongoing TCP connections shouldn’t break them.

											
										
										
											2023-08-17 17:20:42 +00:00
+								    if (TimeManagement::the().monotonic_time(TimePrecision::Precise) >= m_last_ack_sent_time + Duration::from_milliseconds(500))
-												Kernel: Coalesce TCP ACKs

Previously we'd send a TCP ACK for each TCP packet we received. This
changes NetworkTask so that we send fewer TCP ACKs.

											
										
										
											2021-05-12 07:14:37 +00:00
+								        return false;
 								    return true;
 								}
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								NetworkOrdered<u16> TCPSocket::compute_tcp_checksum(IPv4Address const& source, IPv4Address const& destination, TCPPacket const& packet, u16 payload_size)
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								{
-												Kernel: Convert TCP pseudo-headers through a union

This keeps us from tripping strict aliasing, which previously made TCP
connections inoperable when building without `-fsanitize=undefined` or
`-fno-strict-aliasing`.

											
										
										
											2022-12-12 16:52:00 +00:00
+								    union PseudoHeader {
 								        struct [[gnu::packed]] {
 								            IPv4Address source;
 								            IPv4Address destination;
 								            u8 zero;
 								            u8 protocol;
 								            NetworkOrdered<u16> payload_size;
 								        } header;
 								        u16 raw[6];
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								    };
-												Kernel: Convert TCP pseudo-headers through a union

This keeps us from tripping strict aliasing, which previously made TCP
connections inoperable when building without `-fsanitize=undefined` or
`-fno-strict-aliasing`.

											
										
										
											2022-12-12 16:52:00 +00:00
+								    static_assert(sizeof(PseudoHeader) == 12);
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
-												Kernel: Check against TCP packet size overflows in checksum calculation

											
										
										
											2022-12-13 11:39:52 +00:00
+								    Checked<u16> packet_size = packet.header_size();
 								    packet_size += payload_size;
 								    VERIFY(!packet_size.has_overflow());
 								    PseudoHeader pseudo_header { .header = { source, destination, 0, (u8)IPv4Protocol::TCP, packet_size.value() } };
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
-												AK: Rename the common integer typedefs to make it obvious what they are.

These types can be picked up by including <AK/Types.h>:

* u8, u16, u32, u64 (unsigned)
* i8, i16, i32, i64 (signed)

											
										
										
											2019-07-03 19:17:35 +00:00
+								    u32 checksum = 0;
-												Kernel: Convert TCP pseudo-headers through a union

This keeps us from tripping strict aliasing, which previously made TCP
connections inoperable when building without `-fsanitize=undefined` or
`-fno-strict-aliasing`.

											
										
										
											2022-12-12 16:52:00 +00:00
+								    auto* raw_pseudo_header = pseudo_header.raw;
-												AK: Rename the common integer typedefs to make it obvious what they are.

These types can be picked up by including <AK/Types.h>:

* u8, u16, u32, u64 (unsigned)
* i8, i16, i32, i64 (signed)

											
										
										
											2019-07-03 19:17:35 +00:00
+								    for (size_t i = 0; i < sizeof(pseudo_header) / sizeof(u16); ++i) {
-												Kernel: Don't cast to NetworkOrdered<u16>* from random data

NetworkOrdered is a non-trivial type, and it's undefined behavior to
cast a random pointer to it and then pretend it's that type.

Instead just call AK::convert_between_host_and_network_endian on the
individual u16*. This suppresses static analysis warnings.

I don't think there was a "bug" in the previous code, it worked, but
it was very brittle.

											
										
										
											2021-09-01 07:27:42 +00:00
+								        checksum += AK::convert_between_host_and_network_endian(raw_pseudo_header[i]);
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								        if (checksum > 0xffff)
 								            checksum = (checksum >> 16) + (checksum & 0xffff);
 								    }
-												Kernel: Convert TCP pseudo-headers through a union

This keeps us from tripping strict aliasing, which previously made TCP
connections inoperable when building without `-fsanitize=undefined` or
`-fno-strict-aliasing`.

											
										
										
											2022-12-12 16:52:00 +00:00
+								    auto* raw_packet = bit_cast<u16*>(&packet);
-												Kernel: Set MSS option for outbound TCP SYN packets

When the MSS option header is missing the default maximum segment
size is 536 which results in lots of very small TCP packets that
NetworkTask has to handle.

This adds the MSS option header to outbound TCP SYN packets and
sets it to an appropriate value depending on the interface's MTU.

Note that we do not currently do path MTU discovery so this could
cause problems when hops don't fragment packets properly.

											
										
										
											2021-05-11 19:09:11 +00:00
+								    for (size_t i = 0; i < packet.header_size() / sizeof(u16); ++i) {
-												Kernel: Don't cast to NetworkOrdered<u16>* from random data

NetworkOrdered is a non-trivial type, and it's undefined behavior to
cast a random pointer to it and then pretend it's that type.

Instead just call AK::convert_between_host_and_network_endian on the
individual u16*. This suppresses static analysis warnings.

I don't think there was a "bug" in the previous code, it worked, but
it was very brittle.

											
										
										
											2021-09-01 07:27:42 +00:00
+								        checksum += AK::convert_between_host_and_network_endian(raw_packet[i]);
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								        if (checksum > 0xffff)
 								            checksum = (checksum >> 16) + (checksum & 0xffff);
 								    }
-												Kernel: Set MSS option for outbound TCP SYN packets

When the MSS option header is missing the default maximum segment
size is 536 which results in lots of very small TCP packets that
NetworkTask has to handle.

This adds the MSS option header to outbound TCP SYN packets and
sets it to an appropriate value depending on the interface's MTU.

Note that we do not currently do path MTU discovery so this could
cause problems when hops don't fragment packets properly.

											
										
										
											2021-05-11 19:09:11 +00:00
+								    VERIFY(packet.data_offset() * 4 == packet.header_size());
-												Kernel: Convert TCP pseudo-headers through a union

This keeps us from tripping strict aliasing, which previously made TCP
connections inoperable when building without `-fsanitize=undefined` or
`-fno-strict-aliasing`.

											
										
										
											2022-12-12 16:52:00 +00:00
+								    auto* raw_payload = bit_cast<u16*>(packet.payload());
-												AK: Rename the common integer typedefs to make it obvious what they are.

These types can be picked up by including <AK/Types.h>:

* u8, u16, u32, u64 (unsigned)
* i8, i16, i32, i64 (signed)

											
										
										
											2019-07-03 19:17:35 +00:00
+								    for (size_t i = 0; i < payload_size / sizeof(u16); ++i) {
-												Kernel: Don't cast to NetworkOrdered<u16>* from random data

NetworkOrdered is a non-trivial type, and it's undefined behavior to
cast a random pointer to it and then pretend it's that type.

Instead just call AK::convert_between_host_and_network_endian on the
individual u16*. This suppresses static analysis warnings.

I don't think there was a "bug" in the previous code, it worked, but
it was very brittle.

											
										
										
											2021-09-01 07:27:42 +00:00
+								        checksum += AK::convert_between_host_and_network_endian(raw_payload[i]);
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								        if (checksum > 0xffff)
 								            checksum = (checksum >> 16) + (checksum & 0xffff);
 								    }
 								    if (payload_size & 1) {
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								        u16 expanded_byte = ((u8 const*)packet.payload())[payload_size - 1] << 8;
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								        checksum += expanded_byte;
 								        if (checksum > 0xffff)
 								            checksum = (checksum >> 16) + (checksum & 0xffff);
 								    }
 								    return ~(checksum & 0xffff);
 								}
-												Kernel: Replace KResult and KResultOr<T> with Error and ErrorOr<T>

We now use AK::Error and AK::ErrorOr<T> in both kernel and userspace!
This was a slightly tedious refactoring that took a long time, so it's
not unlikely that some bugs crept in.

Nevertheless, it does pass basic functionality testing, and it's just
real nice to finally see the same pattern in all contexts. :^)

											
										
										
											2021-11-07 23:51:39 +00:00
+								ErrorOr<void> TCPSocket::protocol_bind()
-												Kernel: Refactor TCP/IP stack

This has several significant changes to the networking stack.

* Significant refactoring of the TCP state machine. Right now it's
  probably more fragile than it used to be, but handles quite a lot
  more of the handshake process.
* `TCPSocket` holds a `NetworkAdapter*`, assigned during `connect()` or
  `bind()`, whichever comes first.
* `listen()` is now virtual in `Socket` and intended to be implemented
  in its child classes
* `listen()` no longer works without `bind()` - this is a bit of a
  regression, but listening sockets didn't work at all before, so it's
  not possible to observe the regression.
* A file is exposed at `/proc/net_tcp`, which is a JSON document listing
  the current TCP sockets with a bit of metadata.
* There's an `ETHERNET_VERY_DEBUG` flag for dumping packet's content out
  to `kprintf`. It is, indeed, _very debug_.

											
										
										
											2019-08-06 13:40:38 +00:00
+								{
-												Kernel/Net: Rework ephemeral port allocation

Currently, ephemeral port allocation is handled by the
allocate_local_port_if_needed() and protocol_allocate_local_port()
methods. Actually binding the socket to an address (which means
inserting the socket/address pair into a global map) is performed either
in protocol_allocate_local_port() (for ephemeral ports) or in
protocol_listen() (for non-ephemeral ports); the latter will fail with
EADDRINUSE if the address is already used by an existing pair present in
the map.

There used to be a bug where for listen() without an explicit bind(),
the port allocation would conflict with itself: first an ephemeral port
would get allocated and inserted into the map, and then
protocol_listen() would check again for the port being free, find the
just-created map entry, and error out. This was fixed in commit
01e5af487f9513696dbcacab15d3e0036446f586 by passing an additional flag
did_allocate_port into protocol_listen() which specifies whether the
port was just allocated, and skipping the check in protocol_listen() if
the flag is set.

However, this only helps if the socket is bound to an ephemeral port
inside of this very listen() call. But calling bind(sin_port = 0) from
userspace should succeed and bind to an allocated ephemeral port, in the
same way as using an unbound socket for connect() does. The port number
can then be retrieved from userspace by calling getsockname(), and it
should be possible to either connect() or listen() on this socket,
keeping the allocated port number. Also, calling bind() when already
bound (either explicitly or implicitly) should always result in EINVAL.

To untangle this, introduce an explicit m_bound state in IPv4Socket,
just like LocalSocket has already. Once a socket is bound, further
attempts to bind it fail. Some operations cause the socket to implicitly
get bound to an (ephemeral) address; this is implemented by the new
ensure_bound() method. The protocol_allocate_local_port() method is
gone; it is now up to a protocol to assign a port to the socket inside
protocol_bind() if it finds that the socket has local_port() == 0.

protocol_bind() is now called in more cases, such as inside listen() if
the socket wasn't bound before that.

											
										
										
											2023-07-23 12:43:45 +00:00
+								    dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket::protocol_bind(), local_port() is {}", local_port());
 								    // Check that we do have the address we're trying to bind to.
 								    TRY(m_adapter.with([this](auto& adapter) -> ErrorOr<void> {
-												Kernel/Net: Iron out the locking mechanism across the subsystem

There is a big mix of LockRefPtrs all over the Networking subsystem, as
well as lots of room for improvements with our locking patterns, which
this commit will not pursue, but will give a good start for such work.

To deal with this situation, we change the following things:
- Creating instances of NetworkAdapter should always yield a non-locking
  NonnullRefPtr. Acquiring an instance from the NetworkingManagement
  should give a simple RefPtr, as giving LockRefPtr does not really
  protect from concurrency problems in such case.
- Since NetworkingManagement works with normal RefPtrs we should
  protect all instances of RefPtr<NetworkAdapter> with SpinlockProtected
  to ensure references do not go away unexpectedly.
- Protect the so_error class member with a proper spinlock. This happens
  to be important because the clear_so_error() method lacked any proper
  locking measures. It also helps prevent a possible TOCTOU when we
  might do more fine-grained locking in the Socket code, so this could
  definitely be a start for this.
- Change unnecessary LockRefPtr<PacketWithTimestamp> in the structure
  of OutgoingPacket to a simple RefPtr<PacketWithTimestamp> as the whole
  list should be MutexProtected.

											
										
										
											2023-04-11 00:50:15 +00:00
+								        if (has_specific_local_address() && !adapter) {
 								            adapter = NetworkingManagement::the().from_ipv4_address(local_address());
 								            if (!adapter)
 								                return set_so_error(EADDRNOTAVAIL);
 								        }
 								        return {};
-												Kernel/Net: Rework ephemeral port allocation

Currently, ephemeral port allocation is handled by the
allocate_local_port_if_needed() and protocol_allocate_local_port()
methods. Actually binding the socket to an address (which means
inserting the socket/address pair into a global map) is performed either
in protocol_allocate_local_port() (for ephemeral ports) or in
protocol_listen() (for non-ephemeral ports); the latter will fail with
EADDRINUSE if the address is already used by an existing pair present in
the map.

There used to be a bug where for listen() without an explicit bind(),
the port allocation would conflict with itself: first an ephemeral port
would get allocated and inserted into the map, and then
protocol_listen() would check again for the port being free, find the
just-created map entry, and error out. This was fixed in commit
01e5af487f9513696dbcacab15d3e0036446f586 by passing an additional flag
did_allocate_port into protocol_listen() which specifies whether the
port was just allocated, and skipping the check in protocol_listen() if
the flag is set.

However, this only helps if the socket is bound to an ephemeral port
inside of this very listen() call. But calling bind(sin_port = 0) from
userspace should succeed and bind to an allocated ephemeral port, in the
same way as using an unbound socket for connect() does. The port number
can then be retrieved from userspace by calling getsockname(), and it
should be possible to either connect() or listen() on this socket,
keeping the allocated port number. Also, calling bind() when already
bound (either explicitly or implicitly) should always result in EINVAL.

To untangle this, introduce an explicit m_bound state in IPv4Socket,
just like LocalSocket has already. Once a socket is bound, further
attempts to bind it fail. Some operations cause the socket to implicitly
get bound to an (ephemeral) address; this is implemented by the new
ensure_bound() method. The protocol_allocate_local_port() method is
gone; it is now up to a protocol to assign a port to the socket inside
protocol_bind() if it finds that the socket has local_port() == 0.

protocol_bind() is now called in more cases, such as inside listen() if
the socket wasn't bound before that.

											
										
										
											2023-07-23 12:43:45 +00:00
+								    }));
 								    if (local_port() == 0) {
 								        // Allocate an unused ephemeral port.
 								        constexpr u16 first_ephemeral_port = 32768;
 								        constexpr u16 last_ephemeral_port = 60999;
 								        constexpr u16 ephemeral_port_range_size = last_ephemeral_port - first_ephemeral_port;
 								        u16 first_scan_port = first_ephemeral_port + get_good_random<u16>() % ephemeral_port_range_size;
 								        return sockets_by_tuple().with_exclusive([&](auto& table) -> ErrorOr<void> {
 								            u16 port = first_scan_port;
 								            while (true) {
 								                IPv4SocketTuple proposed_tuple(local_address(), port, peer_address(), peer_port());
 								                auto it = table.find(proposed_tuple);
 								                if (it == table.end()) {
 								                    set_local_port(port);
 								                    table.set(proposed_tuple, this);
 								                    dbgln_if(TCP_SOCKET_DEBUG, "...allocated port {}, tuple {}", port, proposed_tuple.to_string());
 								                    return {};
 								                }
 								                ++port;
 								                if (port > last_ephemeral_port)
 								                    port = first_ephemeral_port;
 								                if (port == first_scan_port)
 								                    break;
 								            }
 								            return set_so_error(EADDRINUSE);
 								        });
 								    } else {
 								        // Verify that the user-supplied port is not already used by someone else.
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets-by-tuple table is first locked in shared mode to check if the
socket exists, bailing out if it does, then unlocked, then locked
exclusively to add the tuple. There could be a race condition where two
client creation requests for the same tuple happen at the same time and
both clear the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								        bool ok = sockets_by_tuple().with_exclusive([&](auto& table) -> bool {
 								            if (table.contains(tuple()))
 								                return false;
 								            table.set(tuple(), this);
 								            return true;
 								        });
 								        if (!ok)
-												Kernel: Add so_error to keep track of the socket's error state

This sets the m_so_error variable every time the socket returns an
error.

											
										
										
											2021-08-01 15:27:23 +00:00
+								            return set_so_error(EADDRINUSE);
-												Kernel/Net: Rework ephemeral port allocation

Currently, ephemeral port allocation is handled by the
allocate_local_port_if_needed() and protocol_allocate_local_port()
methods. Actually binding the socket to an address (which means
inserting the socket/address pair into a global map) is performed either
in protocol_allocate_local_port() (for ephemeral ports) or in
protocol_listen() (for non-ephemeral ports); the latter will fail with
EADDRINUSE if the address is already used by an existing pair present in
the map.

There used to be a bug where for listen() without an explicit bind(),
the port allocation would conflict with itself: first an ephemeral port
would get allocated and inserted into the map, and then
protocol_listen() would check again for the port being free, find the
just-created map entry, and error out. This was fixed in commit
01e5af487f9513696dbcacab15d3e0036446f586 by passing an additional flag
did_allocate_port into protocol_listen() which specifies whether the
port was just allocated, and skipping the check in protocol_listen() if
the flag is set.

However, this only helps if the socket is bound to an ephemeral port
inside of this very listen() call. But calling bind(sin_port = 0) from
userspace should succeed and bind to an allocated ephemeral port, in the
same way as using an unbound socket for connect() does. The port number
can then be retrieved from userspace by calling getsockname(), and it
should be possible to either connect() or listen() on this socket,
keeping the allocated port number. Also, calling bind() when already
bound (either explicitly or implicitly) should always result in EINVAL.

To untangle this, introduce an explicit m_bound state in IPv4Socket,
just like LocalSocket has already. Once a socket is bound, further
attempts to bind it fail. Some operations cause the socket to implicitly
get bound to an (ephemeral) address; this is implemented by the new
ensure_bound() method. The protocol_allocate_local_port() method is
gone; it is now up to a protocol to assign a port to the socket inside
protocol_bind() if it finds that the socket has local_port() == 0.

protocol_bind() is now called in more cases, such as inside listen() if
the socket wasn't bound before that.

											
										
										
											2023-07-23 12:43:45 +00:00
+								        return {};
-												Kernel: Dont try to register ephemeral TCP ports twice

											
										
										
											2021-05-31 23:11:14 +00:00
+								    }
-												Kernel/Net: Rework ephemeral port allocation

Currently, ephemeral port allocation is handled by the
allocate_local_port_if_needed() and protocol_allocate_local_port()
methods. Actually binding the socket to an address (which means
inserting the socket/address pair into a global map) is performed either
in protocol_allocate_local_port() (for ephemeral ports) or in
protocol_listen() (for non-ephemeral ports); the latter will fail with
EADDRINUSE if the address is already used by an existing pair present in
the map.

There used to be a bug where for listen() without an explicit bind(),
the port allocation would conflict with itself: first an ephemeral port
would get allocated and inserted into the map, and then
protocol_listen() would check again for the port being free, find the
just-created map entry, and error out. This was fixed in commit
01e5af487f9513696dbcacab15d3e0036446f586 by passing an additional flag
did_allocate_port into protocol_listen() which specifies whether the
port was just allocated, and skipping the check in protocol_listen() if
the flag is set.

However, this only helps if the socket is bound to an ephemeral port
inside of this very listen() call. But calling bind(sin_port = 0) from
userspace should succeed and bind to an allocated ephemeral port, in the
same way as using an unbound socket for connect() does. The port number
can then be retrieved from userspace by calling getsockname(), and it
should be possible to either connect() or listen() on this socket,
keeping the allocated port number. Also, calling bind() when already
bound (either explicitly or implicitly) should always result in EINVAL.

To untangle this, introduce an explicit m_bound state in IPv4Socket,
just like LocalSocket has already. Once a socket is bound, further
attempts to bind it fail. Some operations cause the socket to implicitly
get bound to an (ephemeral) address; this is implemented by the new
ensure_bound() method. The protocol_allocate_local_port() method is
gone; it is now up to a protocol to assign a port to the socket inside
protocol_bind() if it finds that the socket has local_port() == 0.

protocol_bind() is now called in more cases, such as inside listen() if
the socket wasn't bound before that.

											
										
										
											2023-07-23 12:43:45 +00:00
+								}
-												Kernel: Dont try to register ephemeral TCP ports twice

											
										
										
											2021-05-31 23:11:14 +00:00
-												Kernel/Net: Rework ephemeral port allocation

Currently, ephemeral port allocation is handled by the
allocate_local_port_if_needed() and protocol_allocate_local_port()
methods. Actually binding the socket to an address (which means
inserting the socket/address pair into a global map) is performed either
in protocol_allocate_local_port() (for ephemeral ports) or in
protocol_listen() (for non-ephemeral ports); the latter will fail with
EADDRINUSE if the address is already used by an existing pair present in
the map.

There used to be a bug where for listen() without an explicit bind(),
the port allocation would conflict with itself: first an ephemeral port
would get allocated and inserted into the map, and then
protocol_listen() would check again for the port being free, find the
just-created map entry, and error out. This was fixed in commit
01e5af487f9513696dbcacab15d3e0036446f586 by passing an additional flag
did_allocate_port into protocol_listen() which specifies whether the
port was just allocated, and skipping the check in protocol_listen() if
the flag is set.

However, this only helps if the socket is bound to an ephemeral port
inside of this very listen() call. But calling bind(sin_port = 0) from
userspace should succeed and bind to an allocated ephemeral port, in the
same way as using an unbound socket for connect() does. The port number
can then be retrieved from userspace by calling getsockname(), and it
should be possible to either connect() or listen() on this socket,
keeping the allocated port number. Also, calling bind() when already
bound (either explicitly or implicitly) should always result in EINVAL.

To untangle this, introduce an explicit m_bound state in IPv4Socket,
just like LocalSocket has already. Once a socket is bound, further
attempts to bind it fail. Some operations cause the socket to implicitly
get bound to an (ephemeral) address; this is implemented by the new
ensure_bound() method. The protocol_allocate_local_port() method is
gone; it is now up to a protocol to assign a port to the socket inside
protocol_bind() if it finds that the socket has local_port() == 0.

protocol_bind() is now called in more cases, such as inside listen() if
the socket wasn't bound before that.

											
										
										
											2023-07-23 12:43:45 +00:00
+								ErrorOr<void> TCPSocket::protocol_listen()
 								{
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
+								    set_direction(Direction::Passive);
-												Kernel: Refactor TCP/IP stack

This has several significant changes to the networking stack.

* Significant refactoring of the TCP state machine. Right now it's
  probably more fragile than it used to be, but handles quite a lot
  more of the handshake process.
* `TCPSocket` holds a `NetworkAdapter*`, assigned during `connect()` or
  `bind()`, whichever comes first.
* `listen()` is now virtual in `Socket` and intended to be implemented
  in its child classes
* `listen()` no longer works without `bind()` - this is a bit of a
  regression, but listening sockets didn't work at all before, so it's
  not possible to observe the regression.
* A file is exposed at `/proc/net_tcp`, which is a JSON document listing
  the current TCP sockets with a bit of metadata.
* There's an `ETHERNET_VERY_DEBUG` flag for dumping packet's content out
  to `kprintf`. It is, indeed, _very debug_.

											
										
										
											2019-08-06 13:40:38 +00:00
+								    set_state(State::Listen);
-												Kernel: Use a more detailed state machine for socket setup

											
										
										
											2019-08-10 03:17:00 +00:00
+								    set_setup_state(SetupState::Completed);
-												Kernel: Replace KResult and KResultOr<T> with Error and ErrorOr<T>

We now use AK::Error and AK::ErrorOr<T> in both kernel and userspace!
This was a slightly tedious refactoring that took a long time, so it's
not unlikely that some bugs crept in.

Nevertheless, it does pass basic functionality testing, and it's just
real nice to finally see the same pattern in all contexts. :^)

											
										
										
											2021-11-07 23:51:39 +00:00
+								    return {};
-												Kernel: Refactor TCP/IP stack

This has several significant changes to the networking stack.

* Significant refactoring of the TCP state machine. Right now it's
  probably more fragile than it used to be, but handles quite a lot
  more of the handshake process.
* `TCPSocket` holds a `NetworkAdapter*`, assigned during `connect()` or
  `bind()`, whichever comes first.
* `listen()` is now virtual in `Socket` and intended to be implemented
  in its child classes
* `listen()` no longer works without `bind()` - this is a bit of a
  regression, but listening sockets didn't work at all before, so it's
  not possible to observe the regression.
* A file is exposed at `/proc/net_tcp`, which is a JSON document listing
  the current TCP sockets with a bit of metadata.
* There's an `ETHERNET_VERY_DEBUG` flag for dumping packet's content out
  to `kprintf`. It is, indeed, _very debug_.

											
										
										
											2019-08-06 13:40:38 +00:00
+								}
-												Kernel: Remove the Socket::{protocol,}connect ShouldBlock argument

This argument is always set to description.is_blocking(), but
description is also given as a separate argument, so there's no point
in piping it through separately.

											
										
										
											2022-07-13 06:31:24 +00:00
+								ErrorOr<void> TCPSocket::protocol_connect(OpenFileDescription& description)
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								{
-												Kernel: Rename Socket::lock() => Socket::mutex()

"lock" is ambiguous (verb vs noun) while "mutex" is not.

											
										
										
											2021-08-29 11:10:55 +00:00
+								    MutexLocker locker(mutex());
-												IPv4: Take the socket lock more (fixes TCP connection to localhost)

This fixes an issue where making a TCP connection to localhost didn't
work correctly since the loopback interface is currently synchronous.
(Sending something to localhost would enqueue a packet on the same
interface and then immediately wake the network task to process that
packet.)

This was preventing the TCP handshake from working correctly with
localhost since we'd send out the SYN packet before moving to the
SynSent state. The lock is now held long enough for this operation
to be atomic.

											
										
										
											2020-10-21 18:51:02 +00:00
-												Kernel: Add simple ARP routing layer

This replaces the previous placeholder routing layer with a real one!
It's still very primitive, doesn't deal with things like timeouts very
well, and will probably need several more iterations to support more
normal networking things.

I haven't confirmed that this works with anything other than the QEMU
user networking layer, but I suspect that's what nearly everybody is
using at this point, so that's the important target to keep working.

											
										
										
											2019-08-28 11:58:01 +00:00
+								    auto routing_decision = route_to(peer_address(), local_address());
-												Kernel: Implement is_zero for RoutingDecision

											
										
										
											2019-08-29 01:18:38 +00:00
+								    if (routing_decision.is_zero())
-												Kernel: Add so_error to keep track of the socket's error state

This sets the m_so_error variable every time the socket returns an
error.

											
										
										
											2021-08-01 15:27:23 +00:00
+								        return set_so_error(EHOSTUNREACH);
-												Kernel: Add simple ARP routing layer

This replaces the previous placeholder routing layer with a real one!
It's still very primitive, doesn't deal with things like timeouts very
well, and will probably need several more iterations to support more
normal networking things.

I haven't confirmed that this works with anything other than the QEMU
user networking layer, but I suspect that's what nearly everybody is
using at this point, so that's the important target to keep working.

											
										
										
											2019-08-28 11:58:01 +00:00
+								    if (!has_specific_local_address())
 								        set_local_address(routing_decision.adapter->ipv4_address());
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
-												Kernel/Net: Rework ephemeral port allocation

Currently, ephemeral port allocation is handled by the
allocate_local_port_if_needed() and protocol_allocate_local_port()
methods. Actually binding the socket to an address (which means
inserting the socket/address pair into a global map) is performed either
in protocol_allocate_local_port() (for ephemeral ports) or in
protocol_listen() (for non-ephemeral ports); the latter will fail with
EADDRINUSE if the address is already used by an existing pair present in
the map.

There used to be a bug where for listen() without an explicit bind(),
the port allocation would conflict with itself: first an ephemeral port
would get allocated and inserted into the map, and then
protocol_listen() would check again for the port being free, find the
just-created map entry, and error out. This was fixed in commit
01e5af487f9513696dbcacab15d3e0036446f586 by passing an additional flag
did_allocate_port into protocol_listen() which specifies whether the
port was just allocated, and skipping the check in protocol_listen() if
the flag is set.

However, this only helps if the socket is bound to an ephemeral port
inside of this very listen() call. But calling bind(sin_port = 0) from
userspace should succeed and bind to an allocated ephemeral port, in the
same way as using an unbound socket for connect() does. The port number
can then be retrieved from userspace by calling getsockname(), and it
should be possible to either connect() or listen() on this socket,
keeping the allocated port number. Also, calling bind() when already
bound (either explicitly or implicitly) should always result in EINVAL.

To untangle this, introduce an explicit m_bound state in IPv4Socket,
just like LocalSocket has already. Once a socket is bound, further
attempts to bind it fail. Some operations cause the socket to implicitly
get bound to an (ephemeral) address; this is implemented by the new
ensure_bound() method. The protocol_allocate_local_port() method is
gone; it is now up to a protocol to assign a port to the socket inside
protocol_bind() if it finds that the socket has local_port() == 0.

protocol_bind() is now called in more cases, such as inside listen() if
the socket wasn't bound before that.

											
										
										
											2023-07-23 12:43:45 +00:00
+								    TRY(ensure_bound());
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
-												IPv4: Randomize the first TCP sequence number

Fixes #185.

											
										
										
											2020-01-08 15:03:01 +00:00
+								    m_sequence_number = get_good_random<u32>();
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								    m_ack_number = 0;
-												Kernel: Use a more detailed state machine for socket setup

											
										
										
											2019-08-10 03:17:00 +00:00
+								    set_setup_state(SetupState::InProgress);
-												Kernel: Use TRY() in TCPSocket

											
										
										
											2021-09-05 13:48:40 +00:00
+								    TRY(send_tcp_packet(TCPFlags::SYN));
-												Kernel: Refactor TCP/IP stack

This has several significant changes to the networking stack.

* Significant refactoring of the TCP state machine. Right now it's
  probably more fragile than it used to be, but handles quite a lot
  more of the handshake process.
* `TCPSocket` holds a `NetworkAdapter*`, assigned during `connect()` or
  `bind()`, whichever comes first.
* `listen()` is now virtual in `Socket` and intended to be implemented
  in its child classes
* `listen()` no longer works without `bind()` - this is a bit of a
  regression, but listening sockets didn't work at all before, so it's
  not possible to observe the regression.
* A file is exposed at `/proc/net_tcp`, which is a JSON document listing
  the current TCP sockets with a bit of metadata.
* There's an `ETHERNET_VERY_DEBUG` flag for dumping packet's content out
  to `kprintf`. It is, indeed, _very debug_.

											
										
										
											2019-08-06 13:40:38 +00:00
+								    m_state = State::SynSent;
-												Kernel: Add Socket::set_role() and use it everywhere

Instead of having Socket subclasses write their role into Socket::m_role
directly, add a setter to do this.

											
										
										
											2021-08-29 00:04:30 +00:00
+								    set_role(Role::Connecting);
-												Kernel: Implement TCP listening sockets and incoming connections

											
										
										
											2019-08-09 02:48:28 +00:00
+								    m_direction = Direction::Outgoing;
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
-												Kernel: Move block condition evaluation out of the Scheduler

This makes the Scheduler a lot leaner by not having to evaluate
block conditions every time it is invoked. Instead evaluate them as
the states change, and unblock threads at that point.

This also implements some more waitid/waitpid/wait features and
behavior. For example, WUNTRACED and WNOWAIT are now supported. And
wait will now not return EINTR when SIGCHLD is delivered at the
same time.

											
										
										
											2020-11-29 23:05:27 +00:00
+								    evaluate_block_conditions();
-												Kernel: Remove the Socket::{protocol,}connect ShouldBlock argument

This argument is always set to description.is_blocking(), but
description is also given as a separate argument, so there's no point
in piping it through separately.

											
										
										
											2022-07-13 06:31:24 +00:00
+								    if (description.is_blocking()) {
-												IPv4: Take the socket lock more (fixes TCP connection to localhost)

This fixes an issue where making a TCP connection to localhost didn't
work correctly since the loopback interface is currently synchronous.
(Sending something to localhost would enqueue a packet on the same
interface and then immediately wake the network task to process that
packet.)

This was preventing the TCP handshake from working correctly with
localhost since we'd send out the SYN packet before moving to the
SynSent state. The lock is now held long enough for this operation
to be atomic.

											
										
										
											2020-10-21 18:51:02 +00:00
+								        locker.unlock();
-												Kernel: Move block condition evaluation out of the Scheduler

This makes the Scheduler a lot leaner by not having to evaluate
block conditions every time it is invoked. Instead evaluate them as
the states change, and unblock threads at that point.

This also implements some more waitid/waitpid/wait features and
behavior. For example, WUNTRACED and WNOWAIT are now supported. And
wait will now not return EINTR when SIGCHLD is delivered at the
same time.

											
										
										
											2020-11-29 23:05:27 +00:00
+								        auto unblock_flags = Thread::FileBlocker::BlockFlags::None;
-												AK: Simplify constructors and conversions from nullptr_t

Problem:
- Many constructors are defined as `{}` rather than using the ` =
  default` compiler-provided constructor.
- Some types provide an implicit conversion operator from `nullptr_t`
  instead of requiring the caller to default construct. This violates
  the C++ Core Guidelines suggestion to declare single-argument
  constructors explicit
  (https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#c46-by-default-declare-single-argument-constructors-explicit).

Solution:
- Change default constructors to use the compiler-provided default
  constructor.
- Remove implicit conversion operators from `nullptr_t` and change
  usage to enforce type consistency without conversion.

											
										
										
											2021-01-10 23:29:28 +00:00
+								        if (Thread::current()->block<Thread::ConnectBlocker>({}, description, unblock_flags).was_interrupted())
-												Kernel: Add so_error to keep track of the socket's error state

This sets the m_so_error variable every time the socket returns an
error.

											
										
										
											2021-08-01 15:27:23 +00:00
+								            return set_so_error(EINTR);
-												IPv4: Take the socket lock more (fixes TCP connection to localhost)

This fixes an issue where making a TCP connection to localhost didn't
work correctly since the loopback interface is currently synchronous.
(Sending something to localhost would enqueue a packet on the same
interface and then immediately wake the network task to process that
packet.)

This was preventing the TCP handshake from working correctly with
localhost since we'd send out the SYN packet before moving to the
SynSent state. The lock is now held long enough for this operation
to be atomic.

											
										
										
											2020-10-21 18:51:02 +00:00
+								        locker.lock();
-												Everywhere: Rename ASSERT => VERIFY

(...and ASSERT_NOT_REACHED => VERIFY_NOT_REACHED)

Since all of these checks are done in release builds as well,
let's rename them to VERIFY to prevent confusion, as everyone is
used to assertions being compiled out in release.

We can introduce a new ASSERT macro that is specifically for debug
checks, but I'm doing this wholesale conversion first since we've
accumulated thousands of these already, and it's not immediately
obvious which ones are suitable for ASSERT.

											
										
										
											2021-02-23 19:42:32 +00:00
+								        VERIFY(setup_state() == SetupState::Completed);
-												Kernel: Move block condition evaluation out of the Scheduler

This makes the Scheduler a lot leaner by not having to evaluate
block conditions every time it is invoked. Instead evaluate them as
the states change, and unblock threads at that point.

This also implements some more waitid/waitpid/wait features and
behavior. For example, WUNTRACED and WNOWAIT are now supported. And
wait will now not return EINTR when SIGCHLD is delivered at the
same time.

											
										
										
											2020-11-29 23:05:27 +00:00
+								        if (has_error()) { // TODO: check unblock_flags
-												Kernel: Add Socket::set_role() and use it everywhere

Instead of having Socket subclasses write their role into Socket::m_role
directly, add a setter to do this.

											
										
										
											2021-08-29 00:04:30 +00:00
+								            set_role(Role::None);
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
+								            if (error() == TCPSocket::Error::RetransmitTimeout)
-												Kernel: Add so_error to keep track of the socket's error state

This sets the m_so_error variable every time the socket returns an
error.

											
										
										
											2021-08-01 15:27:23 +00:00
+								                return set_so_error(ETIMEDOUT);
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
+								            else
-												Kernel: Add so_error to keep track of the socket's error state

This sets the m_so_error variable every time the socket returns an
error.

											
										
										
											2021-08-01 15:27:23 +00:00
+								                return set_so_error(ECONNREFUSED);
-												Kernel: Move socket role tracking to the Socket class itself

This is more logical and allows us to solve the problem of
non-blocking TCP sockets getting stuck in SocketRole::None.

The only complication is that a single LocalSocket may be shared
between two file descriptions (on the connect and accept sides),
and should have two different roles depending on which side
you look at it from. To deal with this, Socket::role() is made a
virtual method that accepts a file description, and LocalSocket
internally tracks which FileDescription is which and
returns the correct role.

											
										
										
											2019-08-11 13:38:20 +00:00
+								        }
-												Kernel: Replace KResult and KResultOr<T> with Error and ErrorOr<T>

We now use AK::Error and AK::ErrorOr<T> in both kernel and userspace!
This was a slightly tedious refactoring that took a long time, so it's
not unlikely that some bugs crept in.

Nevertheless, it does pass basic functionality testing, and it's just
real nice to finally see the same pattern in all contexts. :^)

											
										
										
											2021-11-07 23:51:39 +00:00
+								        return {};
-												Kernel: Support non-blocking connect().

If connect() is called on a non-blocking socket, it will "fail" immediately
with -EINPROGRESS. After that, you select() on the socket and wait for it to
become writable.

											
										
										
											2019-04-08 02:52:21 +00:00
+								    }
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
-												Kernel: Add so_error to keep track of the socket's error state

This sets the m_so_error variable every time the socket returns an
error.

											
										
										
											2021-08-01 15:27:23 +00:00
+								    return set_so_error(EINPROGRESS);
-												Kernel: Factor out TCP parts of IPv4Socket into a TCPSocket class.

											
										
										
											2019-03-14 11:20:38 +00:00
+								}
-												IPv4: Move more stuff from IPv4Socket to TCPSocket.

											
										
										
											2019-03-14 11:28:30 +00:00
-												TCP: Start working on auto-closing connections when we get FIN.

											
										
										
											2019-03-14 14:23:32 +00:00
+								bool TCPSocket::protocol_is_disconnected() const
 								// Reports whether m_state is one of the states this function classifies as
 								// disconnected: Closed, CloseWait, LastAck, FinWait1, FinWait2, Closing or
 								// TimeWait. Any other state falls through to the default case and yields false.
 								{
-												Kernel: Refactor TCP/IP stack

This has several significant changes to the networking stack.

* Significant refactoring of the TCP state machine. Right now it's
  probably more fragile than it used to be, but handles quite a lot
  more of the handshake process.
* `TCPSocket` holds a `NetworkAdapter*`, assigned during `connect()` or
  `bind()`, whichever comes first.
* `listen()` is now virtual in `Socket` and intended to be implemented
  in its child classes
* `listen()` no longer works without `bind()` - this is a bit of a
  regression, but listening sockets didn't work at all before, so it's
  not possible to observe the regression.
* A file is exposed at `/proc/net_tcp`, which is a JSON document listing
  the current TCP sockets with a bit of metadata.
* There's an `ETHERNET_VERY_DEBUG` flag for dumping packet's content out
  to `kprintf`. It is, indeed, _very debug_.

											
										
										
											2019-08-06 13:40:38 +00:00
+								    switch (m_state) {
 								    case State::Closed:
 								    case State::CloseWait:
 								    case State::LastAck:
 								    case State::FinWait1:
 								    case State::FinWait2:
 								    case State::Closing:
 								    case State::TimeWait:
 								        return true;
 								    default:
 								        return false;
 								    }
-												IPv4: Implement bind() for TCP and UDP sockets.

We can't accept connections just yet, but this patch makes it possible to
bind() to a given source address/port.

											
										
										
											2019-05-03 19:51:40 +00:00
+								}
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
 								// Starts shutting down the sending side of the connection.
 								//
 								// Only acts when state() is Established: a packet with the FIN and ACK flags
 								// is sent (its return value is deliberately discarded by the (void) cast) and
 								// the socket moves to FinWait1. Note that the debug message mentions only FIN
 								// although the packet carries FIN | ACK.
 								//
 								// In every other state nothing is sent and the state is left untouched; the
 								// function just logs that no transition happened.
 								void TCPSocket::shut_down_for_writing()
 								{
 								    if (state() == State::Established) {
-												Kernel: Send only FIN when shutting down TCP socket from ESTABLISHED

We were previously sending FIN|ACK for some reason.

											
										
										
											2022-02-06 16:28:58 +00:00
+								        dbgln_if(TCP_SOCKET_DEBUG, " Sending FIN from Established and moving into FinWait1");
-												Kernet/Net: Close a TCP connection using FIN|ACK instead of just FIN

When initiating a connection termination, the FIN should be sent with
a ACK from the last received segment even if that ACK already been sent.

											
										
										
											2023-06-28 22:40:44 +00:00
+								        (void)send_tcp_packet(TCPFlags::FIN | TCPFlags::ACK);
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
+								        set_state(State::FinWait1);
 								    } else {
-												Everywhere: Replace a bundle of dbg with dbgln.

These changes are arbitrarily divided into multiple commits to make it
easier to find potentially introduced bugs with git bisect.Everything:

											
										
										
											2021-01-10 14:43:09 +00:00
+								        dbgln(" Shutting down TCPSocket for writing but not moving to FinWait1 since state is {}", to_string(state()));
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
+								    }
 								}
-												Kernel: Replace KResult and KResultOr<T> with Error and ErrorOr<T>

We now use AK::Error and AK::ErrorOr<T> in both kernel and userspace!
This was a slightly tedious refactoring that took a long time, so it's
not unlikely that some bugs crept in.

Nevertheless, it does pass basic functionality testing, and it's just
real nice to finally see the same pattern in all contexts. :^)

											
										
										
											2021-11-07 23:51:39 +00:00
+								ErrorOr<void> TCPSocket::close()
 								// Closes the socket while holding mutex() for the whole call.
 								//
 								// Steps, in order:
 								//  1. IPv4Socket::close() runs first; its result is stored and is what this
 								//     function returns, regardless of the TCP-level steps below.
 								//  2. If state() is CloseWait, a FIN | ACK packet is sent (the send result is
 								//     intentionally unused) and the state becomes LastAck.
 								//  3. Unless the state is now Closed or Listen, this socket is stored in
 								//     closing_sockets() under its tuple(), with that table locked exclusively.
 								//     NOTE(review): presumably this keeps the socket reachable until the
 								//     close handshake finishes - confirm against where the table is drained.
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
+								{
-												Kernel: Rename Socket::lock() => Socket::mutex()

"lock" is ambiguous (verb vs noun) while "mutex" is not.

											
										
										
											2021-08-29 11:10:55 +00:00
+								    MutexLocker locker(mutex());
-												Kernel: Allow File::close() to fail

And pass the result through to sys$close() return value.

Fixes https://github.com/SerenityOS/serenity/issues/427

											
										
										
											2020-06-02 16:20:05 +00:00
+								    auto result = IPv4Socket::close();
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
+								    if (state() == State::CloseWait) {
-												Everywhere: Turn #if *_DEBUG into dbgln_if/if constexpr

											
										
										
											2021-05-01 19:10:08 +00:00
+								        dbgln_if(TCP_SOCKET_DEBUG, " Sending FIN from CloseWait and moving into LastAck");
-												Everywhere: Switch from (void) to [[maybe_unused]] (#4473)

Problem:
- `(void)` simply casts the expression to void. This is understood to
  indicate that it is ignored, but this is really a compiler trick to
  get the compiler to not generate a warning.

Solution:
- Use the `[[maybe_unused]]` attribute to indicate the value is unused.

Note:
- Functions taking a `(void)` argument list have also been changed to
  `()` because this is not needed and shows up in the same grep
  command.
											
										
										
											2020-12-20 23:09:48 +00:00
+								        [[maybe_unused]] auto rc = send_tcp_packet(TCPFlags::FIN | TCPFlags::ACK);
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
+								        set_state(State::LastAck);
 								    }
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared lock to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								    if (state() != State::Closed && state() != State::Listen)
 								        closing_sockets().with_exclusive([&](auto& table) {
 								            table.set(tuple(), *this);
 								        });
-												Kernel: Allow File::close() to fail

And pass the result through to sys$close() return value.

Fixes https://github.com/SerenityOS/serenity/issues/427

											
										
										
											2020-06-02 16:20:05 +00:00
+								    return result;
-												IPv4: Basic implementation of TCP socket shutdown

We can now participate in the TCP connection closing handshake. :^)
This implementation is definitely not complete and needs to handle a
bunch of other cases. But it's a huge improvement over not being able
to close connections at all.

Note that we hold on to pending-close sockets indefinitely, until they
are moved into the Closed state. This should also have a timeout but
that's still a FIXME. :^)

Fixes #428.

											
										
										
											2020-02-08 14:52:32 +00:00
+								}
-												Kernel: Move all code into the Kernel namespace

											
										
										
											2020-02-16 00:27:42 +00:00
-												Kernel: Rename ProtectedValue<T> => MutexProtected<T>

Let's make it obvious what we're protecting it with.

											
										
										
											2021-08-21 21:31:15 +00:00
+								static Singleton<MutexProtected<TCPSocket::RetransmitList>> s_sockets_for_retransmit; // File-local retransmit list wrapped in MutexProtected; handed out by TCPSocket::sockets_for_retransmit().
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
-												Kernel: Rename ProtectedValue<T> => MutexProtected<T>

Let's make it obvious what we're protecting it with.

											
										
										
											2021-08-21 21:31:15 +00:00
+								MutexProtected<TCPSocket::RetransmitList>& TCPSocket::sockets_for_retransmit()
 								// Accessor for the file-local s_sockets_for_retransmit singleton. Returns a
 								// reference to the MutexProtected wrapper, so callers have to go through its
 								// with_exclusive()/with_shared() style accessors to reach the list itself.
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
+								{
 								    return *s_sockets_for_retransmit;
 								}
 								// Appends this socket to the list returned by sockets_for_retransmit(). The
 								// append runs inside with_exclusive(), i.e. with the list locked exclusively.
 								void TCPSocket::enqueue_for_retransmit()
 								{
-												Kernel: Convert TCP retransmit queue from HashTable to IntrusiveList

											
										
										
											2021-08-15 14:37:45 +00:00
+								    sockets_for_retransmit().with_exclusive([&](auto& list) {
 								        list.append(*this);
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared lock to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								    });
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
+								}
 								// Removes this socket from the list returned by sockets_for_retransmit(). The
 								// removal runs inside with_exclusive(), i.e. with the list locked exclusively.
 								void TCPSocket::dequeue_for_retransmit()
 								{
-												Kernel: Convert TCP retransmit queue from HashTable to IntrusiveList

											
										
										
											2021-08-15 14:37:45 +00:00
+								    sockets_for_retransmit().with_exclusive([&](auto& list) {
 								        list.remove(*this);
-												Kernel: Migrate TCP socket tables locking to ProtectedValue

Note: TCPSocket::create_client() has a dubious locking process where
the sockets by tuple table is first shared lock to check if the socket
exists and bail out if it does, then unlocks, then exclusively locks to
add the tuple. There could be a race condition where two client
creation requests for the same tuple happen at the same time and both
cleared the shared lock check. When in doubt, lock exclusively the
whole time.

											
										
										
											2021-07-18 10:14:43 +00:00
+								    });
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
+								}
 								// Re-sends every packet currently held in m_unacked_packets, rate-limited by
 								// exponential backoff.
 								//
 								// Flow:
 								//  - The backoff interval is 1 second doubled m_retransmit_attempts times. If
 								//    m_last_retransmit_time is more recent than (now - interval), return
 								//    without doing anything. "now" is the monotonic clock.
 								//  - Otherwise record "now" as the last retransmit time and increment
 								//    m_retransmit_attempts.
 								//  - If the attempt count now exceeds maximum_retransmits, give up: the state
 								//    becomes Closed, the error becomes RetransmitTimeout, the setup state
 								//    becomes Completed, and nothing is sent.
 								//  - Look up a route to the peer via the bound interface; return silently if
 								//    routing_decision.is_zero().
 								//  - With m_unacked_packets locked exclusively, for each queued packet: bump
 								//    its tx_counter, optionally log it (TCP_SOCKET_DEBUG), rewrite its IPv4
 								//    header through the routed adapter, send it, and update m_packets_out and
 								//    m_bytes_out.
 								//
 								// A packet whose stored ipv4_payload_offset differs from the routed adapter's
 								// offset hits VERIFY_NOT_REACHED() (see the FIXME inside the loop).
 								void TCPSocket::retransmit_packets()
 								{
-												Kernel/Net: Use monotonic time for TCP times

These were using real time as a mistake before; changing the system time
during ongoing TCP connections shouldn’t break them.

											
										
										
											2023-08-17 17:20:42 +00:00
+								    auto now = TimeManagement::the().monotonic_time();
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
 								    // RFC6298 says we should have at least one second between retransmits. According to
 								    // RFC1122 we must do exponential backoff - even for SYN packets.
 								    i64 retransmit_interval = 1;
 								    for (decltype(m_retransmit_attempts) i = 0; i < m_retransmit_attempts; i++)
 								        retransmit_interval *= 2;
-												AK: Rename Time to Duration

That's what this class really is; in fact that's what the first line of
the comment says it is.

This commit does not rename the main files, since those will contain
other time-related classes in a little bit.

											
										
										
											2023-03-13 15:30:34 +00:00
+								    if (m_last_retransmit_time > now - Duration::from_seconds(retransmit_interval))
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
+								        return;
 								    dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket({}) handling retransmit", this);
 								    m_last_retransmit_time = now;
 								    ++m_retransmit_attempts;
 								    if (m_retransmit_attempts > maximum_retransmits) {
 								        set_state(TCPSocket::State::Closed);
 								        set_error(TCPSocket::Error::RetransmitTimeout);
 								        set_setup_state(Socket::SetupState::Completed);
 								        return;
 								    }
-												Kernel/Net: Iron out the locking mechanism across the subsystem

There is a big mix of LockRefPtrs all over the Networking subsystem, as
well as lots of room for improvements with our locking patterns, which
this commit will not pursue, but will give a good start for such work.

To deal with this situation, we change the following things:
- Creating instances of NetworkAdapter should always yield a non-locking
  NonnullRefPtr. Acquiring an instance from the NetworkingManagement
  should give a simple RefPtr,as giving LockRefPtr does not really
  protect from concurrency problems in such case.
- Since NetworkingManagement works with normal RefPtrs we should
  protect all instances of RefPtr<NetworkAdapter> with SpinlockProtected
  to ensure references are gone unexpectedly.
- Protect the so_error class member with a proper spinlock. This happens
  to be important because the clear_so_error() method lacked any proper
  locking measures. It also helps preventing a possible TOCTOU when we
  might do a more fine-grained locking in the Socket code, so this could
  be definitely a start for this.
- Change unnecessary LockRefPtr<PacketWithTimestamp> in the structure
  of OutgoingPacket to a simple RefPtr<PacketWithTimestamp> as the whole
  list should be MutexProtected.

											
										
										
											2023-04-11 00:50:15 +00:00
+								    auto adapter = bound_interface().with([](auto& bound_device) -> RefPtr<NetworkAdapter> { return bound_device; });
 								    auto routing_decision = route_to(peer_address(), local_address(), adapter);
-												Kernel: Merge do_retransmit_packets() into retransmit_packets()

											
										
										
											2021-05-13 09:34:38 +00:00
+								    if (routing_decision.is_zero())
 								        return;
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								    m_unacked_packets.with_exclusive([&](auto& unacked_packets) {
 								        for (auto& packet : unacked_packets.packets) {
 								            packet.tx_counter++;
 								            if constexpr (TCP_SOCKET_DEBUG) {
 								                auto& tcp_packet = *(const TCPPacket*)(packet.buffer->buffer->data() + packet.ipv4_payload_offset);
 								                dbgln("Sending TCP packet from {}:{} to {}:{} with ({}{}{}{}) seq_no={}, ack_no={}, tx_counter={}",
 								                    local_address(), local_port(),
 								                    peer_address(), peer_port(),
 								                    (tcp_packet.has_syn() ? "SYN " : ""),
 								                    (tcp_packet.has_ack() ? "ACK " : ""),
 								                    (tcp_packet.has_fin() ? "FIN " : ""),
 								                    (tcp_packet.has_rst() ? "RST " : ""),
 								                    tcp_packet.sequence_number(),
 								                    tcp_packet.ack_number(),
 								                    packet.tx_counter);
 								            }
-												Kernel: Merge do_retransmit_packets() into retransmit_packets()

											
										
										
											2021-05-13 09:34:38 +00:00
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								            size_t ipv4_payload_offset = routing_decision.adapter->ipv4_payload_offset();
 								            if (ipv4_payload_offset != packet.ipv4_payload_offset) {
 								                // FIXME: Add support for this. This can happen if after a route change
 								                // we ended up on another adapter which doesn't have the same layer 2 type
 								                // like the previous adapter.
 								                VERIFY_NOT_REACHED();
 								            }
-												Kernel: Handle OOM when allocating Packet KBuffers

											
										
										
											2021-08-01 12:11:49 +00:00
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								            auto packet_buffer = packet.buffer->bytes();
-												Kernel: Handle OOM when allocating Packet KBuffers

											
										
										
											2021-08-01 12:11:49 +00:00
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								            routing_decision.adapter->fill_in_ipv4_header(*packet.buffer,
 								                local_address(), routing_decision.next_hop, peer_address(),
-												Kernel+LibC: Add support for the IPv4 TOS field via the IP_TOS sockopt

											
										
										
											2021-10-27 20:20:24 +00:00
+								                IPv4Protocol::TCP, packet_buffer.size() - ipv4_payload_offset, type_of_service(), ttl());
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								            routing_decision.adapter->send_packet(packet_buffer);
 								            m_packets_out++;
 								            m_bytes_out += packet_buffer.size();
 								        }
 								    });
-												Kernel: Try to retransmit lost TCP packets

Previously we didn't retransmit lost TCP packets which would cause
connections to hang if packets were lost. Also we now time out
TCP connections after a number of retransmission attempts.

											
										
										
											2021-05-13 08:49:10 +00:00
+								}
-												Everywhere: Run clang-format

											
										
										
											2022-04-01 17:58:27 +00:00
+								bool TCPSocket::can_write(OpenFileDescription const& file_description, u64 size) const
 								// Decides whether a write of `size` bytes through `file_description` may
 								// proceed right now.
 								//
 								// Returns false when IPv4Socket::can_write() refuses, or while m_state is
 								// SynSent or SynReceived. Past those checks, a non-blocking description
 								// always gets true. For a blocking description the answer is true only if
 								// unacked_packets.size + size still fits within m_send_window_size; that
 								// comparison is made with m_unacked_packets locked in shared mode.
-												Kernel: Block when writing to TCP sockets when the send window is full

Previously we'd just dump those packets into the network adapter's
send queue and hope for the best. Instead we should wait until the peer
has sent TCP ACK packets.

Ideally this would parse the TCP window size option from the SYN or
SYN|ACK packet, but for now we just assume the window size is 64 kB.

											
										
										
											2021-05-26 04:26:20 +00:00
+								{
 								    if (!IPv4Socket::can_write(file_description, size))
 								        return false;
-												Kernel: Block writes while we're establishing the TCP connection

Previously we would not block the caller until the connection was
established and would instead return EPIPE for the first send() call
which then likely caused the caller to abandon the socket.

This was broken by 0625342.

											
										
										
											2021-06-11 06:43:17 +00:00
+								    if (m_state == State::SynSent || m_state == State::SynReceived)
 								        return false;
-												Kernel: Block when writing to TCP sockets when the send window is full

Previously we'd just dump those packets into the network adapter's
send queue and hope for the best. Instead we should wait until the peer
has sent TCP ACK packets.

Ideally this would parse the TCP window size option from the SYN or
SYN|ACK packet, but for now we just assume the window size is 64 kB.

											
										
										
											2021-05-26 04:26:20 +00:00
+								    if (!file_description.is_blocking())
 								        return true;
-												Kernel/TCP: Port TCP retransmit queue to ProtectedValue

I had to switch to exclusive locking since ProtectedValue rightly
doesn't allow you to mutate protected data with only a shared lock.

											
										
										
											2021-08-07 13:42:11 +00:00
+								    return m_unacked_packets.with_shared([&](auto& unacked_packets) {
 								        return unacked_packets.size + size <= m_send_window_size;
 								    });
-												Kernel: Block when writing to TCP sockets when the send window is full

Previously we'd just dump those packets into the network adapter's
send queue and hope for the best. Instead we should wait until the peer
has sent TCP ACK packets.

Ideally this would parse the TCP window size option from the SYN or
SYN|ACK packet, but for now we just assume the window size is 64 kB.

											
										
										
											2021-05-26 04:26:20 +00:00
+								}
-												Kernel: Move all code into the Kernel namespace

											
										
										
											2020-02-16 00:27:42 +00:00
+								}