serenity/Tests/AK/TestString.cpp

/*
 * Copyright (c) 2022, Andreas Kling <kling@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <LibTest/TestCase.h>

#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/Try.h>
#include <AK/Utf8View.h>
#include <AK/Vector.h>

// Checks that a default-constructed String and a String built from an empty
// UTF-8 view are both empty, and that they compare equal to each other and to
// an empty StringView.
TEST_CASE(construct_empty)
{
    String default_constructed;
    EXPECT(default_constructed.is_empty());
    EXPECT_EQ(default_constructed.bytes().size(), 0u);

    auto from_empty_view = MUST(String::from_utf8(""sv));
    EXPECT(from_empty_view.is_empty());
    EXPECT_EQ(default_constructed, from_empty_view);
    EXPECT_EQ(default_constructed, ""sv);
}

// Exercises String's move assignment: after assigning a temporary over an
// existing String, the target must hold the new contents.
//
// This only asserts on the resulting value; a leaked old buffer would have to
// be caught by a leak checker running the test.
TEST_CASE(move_assignment)
{
    // Original scenario. Note that "hello" (5 bytes) fits in the inline
    // short-string storage on 64-bit, so there the previous value owns no
    // heap data.
    String string1 = MUST(String::from_utf8("hello"sv));
    string1 = MUST(String::from_utf8("friends!"sv));
    EXPECT_EQ(string1, "friends!"sv);

    // Same assignment, but starting from a value that is too long to be a
    // short string on either architecture, so the move assignment has to
    // release a previously held long string.
    String string2 = MUST(String::from_utf8("well hello"sv));
    EXPECT_EQ(string2.is_short_string(), false);
    string2 = MUST(String::from_utf8("friends!"sv));
    EXPECT_EQ(string2, "friends!"sv);
}

// Checks the largest input that is still reported as a short string:
// 7 bytes when AK_ARCH_64_BIT is defined, 3 bytes otherwise.
TEST_CASE(short_strings)
{
    // Only the input and its byte count depend on the architecture; the
    // assertions below are shared.
#ifdef AK_ARCH_64_BIT
    auto const text = "abcdefg"sv;
    auto const expected_size = 7u;
#else
    auto const text = "abc"sv;
    auto const expected_size = 3u;
#endif

    auto string = MUST(String::from_utf8(text));
    EXPECT_EQ(string.is_short_string(), true);
    EXPECT_EQ(string.bytes().size(), expected_size);
    EXPECT_EQ(string.bytes_as_string_view(), text);
}

// Checks that an 8-byte input is not reported as a short string and that its
// bytes round-trip unchanged.
TEST_CASE(long_strings)
{
    auto const text = "abcdefgh"sv;

    auto long_string = MUST(String::from_utf8(text));
    EXPECT_EQ(long_string.is_short_string(), false);
    EXPECT_EQ(long_string.bytes().size(), 8u);
    EXPECT_EQ(long_string.bytes_as_string_view(), text);
}

// Takes two prefixes of the same source string via
// substring_from_byte_offset() (start offset 0, lengths 5 and 10) and checks
// their contents.
TEST_CASE(substring)
{
    auto source = MUST(String::from_utf8("Hello I am a long string"sv));

    auto first_five_bytes = MUST(source.substring_from_byte_offset(0, 5));
    auto first_ten_bytes = MUST(source.substring_from_byte_offset(0, 10));

    EXPECT_EQ(first_five_bytes, "Hello"sv);
    EXPECT_EQ(first_ten_bytes, "Hello I am"sv);
}

// Iterates a String by code point and checks that a two-emoji input yields
// exactly the two expected code point values, in order.
TEST_CASE(code_points)
{
    auto string = MUST(String::from_utf8("🦬🪒"sv));

    Vector<u32> code_points;
    for (auto code_point : string.code_points())
        code_points.append(code_point);

    // Pin the count as well as the values: without this, an iterator that
    // produced extra trailing code points would still pass the checks below.
    EXPECT_EQ(code_points.size(), 2u);
    EXPECT_EQ(code_points[0], 0x1f9acu);
    EXPECT_EQ(code_points[1], 0x1fa92u);
}

// Appends two code points to a StringBuilder and checks the String it
// produces: the text must match and occupy 8 bytes in total.
TEST_CASE(string_builder)
{
    StringBuilder code_point_builder;
    code_point_builder.append_code_point(0x1f9acu);
    code_point_builder.append_code_point(0x1fa92u);

    auto built = MUST(code_point_builder.to_string());
    EXPECT_EQ(built.bytes().size(), 8u);
    EXPECT_EQ(built, "🦬🪒"sv);
}

// Checks that a String can be passed as a format argument to
// String::formatted() and is substituted for the "{}" placeholder.
TEST_CASE(ak_format)
{
    auto argument = MUST(String::from_utf8("friends"sv));
    auto formatted = MUST(String::formatted("Hello {}", argument));
    EXPECT_EQ(formatted, "Hello friends"sv);
}

// Covers String::replace() for a needle that occurs in the haystack and for
// one that does not.
TEST_CASE(replace)
{
    // The needle is present: it is swapped for the replacement text.
    {
        auto greeting = MUST(String::from_utf8("Hello enemies"sv));
        auto replaced = MUST(greeting.replace("enemies"sv, "friends"sv, ReplaceMode::All));
        EXPECT_EQ(replaced, "Hello friends"sv);
    }

    // The needle is absent: the result equals the input.
    {
        auto title = MUST(String::from_utf8("anon@courage:~"sv));
        auto replaced = MUST(title.replace("[*]"sv, "(*)"sv, ReplaceMode::FirstOnly));
        EXPECT_EQ(replaced, "anon@courage:~"sv);
    }
}

// Checks String::to_lowercase() on ASCII, Greek, and a dotted capital I input
// whose expected lowercase form is longer than one character.
TEST_CASE(to_lowercase)
{
    // Builds a String from `input`, lowercases it, and compares with `expected`.
    auto expect_lowercased = [](StringView input, StringView expected) {
        auto string = MUST(String::from_utf8(input));
        auto lowered = MUST(string.to_lowercase());
        EXPECT_EQ(lowered, expected);
    };

    expect_lowercased("Aa"sv, "aa"sv);
    expect_lowercased("Ωω"sv, "ωω"sv);
    expect_lowercased("İi̇"sv, "i̇i̇"sv);
}

// Checks String::to_uppercase() on ASCII, Greek, and an input whose expected
// uppercase form is two characters long.
TEST_CASE(to_uppercase)
{
    // Builds a String from `input`, uppercases it, and compares with `expected`.
    auto expect_uppercased = [](StringView input, StringView expected) {
        auto string = MUST(String::from_utf8(input));
        auto uppered = MUST(string.to_uppercase());
        EXPECT_EQ(uppered, expected);
    };

    expect_uppercased("Aa"sv, "AA"sv);
    expect_uppercased("Ωω"sv, "ΩΩ"sv);
    expect_uppercased("ŉ"sv, "ʼN"sv);
}
-												AK: Introduce the new String, replacement for DeprecatedString

DeprecatedString (formerly String) has been with us since the start,
and it has served us well. However, it has a number of shortcomings
that I'd like to address.

Some of these issues are hard if not impossible to solve incrementally
inside of DeprecatedString, so instead of doing that, let's build a new
String class and then incrementally move over to it instead.

Problems in DeprecatedString:

- It assumes string allocation never fails. This makes it impossible
  to use in allocation-sensitive contexts, and is the reason we had to
  ban DeprecatedString from the kernel entirely.

- The awkward null state. DeprecatedString can be null. It's different
  from the empty state, although null strings are considered empty.
  All code is immediately nicer when using Optional<DeprecatedString>
  but DeprecatedString came before Optional, which is how we ended up
  like this.

- The encoding of the underlying data is ambiguous. For the most part,
  we use it as if it's always UTF-8, but there have been cases where
  we pass around strings in other encodings (e.g. ISO 8859-1).

- operator[] and length() are used to iterate over DeprecatedString one
  byte at a time. This is done all over the codebase, and will *not*
  give the right results unless the string is all ASCII.

How we solve these issues in the new String:

- Functions that may allocate now return ErrorOr<String> so that ENOMEM
  errors can be passed to the caller.

- String has no null state. Use Optional<String> when needed.

- String is always UTF-8. This is validated when constructing a String.
  We may need to add a bypass for this in the future, for cases where
  you have a known-good string, but for now: validate all the things!

- There is no operator[] or length(). You can get the underlying data
  with bytes(), but for iterating over code points, you should be using
  a UTF-8 iterator.

Furthermore, it has two nifty new features:

- String implements a small string optimization (SSO) for strings that
  can fit entirely within a pointer. This means up to 3 bytes on 32-bit
  platforms, and 7 bytes on 64-bit platforms. Such small strings will
  not be heap-allocated.

- String can create substrings without making a deep copy of the
  substring. Instead, the superstring gets +1 refcount from the
  substring, and it acts like a view into the superstring. To make
  substrings like this, use the substring_with_shared_superstring() API.

One caveat:

- String does not guarantee that the underlying data is null-terminated
  like DeprecatedString does today. While this was nifty in a handful of
  places where we were calling C functions, it did stand in the way of
  shared-superstring substrings.

											
										
										
											2022-12-01 12:27:43 +00:00
+								/*
 								 * Copyright (c) 2022, Andreas Kling <kling@serenityos.org>
 								 *
 								 * SPDX-License-Identifier: BSD-2-Clause
 								 */
 								#include <LibTest/TestCase.h>
 								#include <AK/String.h>
 								#include <AK/StringBuilder.h>
 								#include <AK/Try.h>
 								#include <AK/Utf8View.h>
 								#include <AK/Vector.h>
 								TEST_CASE(construct_empty)
 								{
 								    String empty;
 								    EXPECT(empty.is_empty());
 								    EXPECT_EQ(empty.bytes().size(), 0u);
 								    auto empty2 = MUST(String::from_utf8(""sv));
 								    EXPECT(empty2.is_empty());
 								    EXPECT_EQ(empty, empty2);
 								    EXPECT_EQ(empty, ""sv);
 								}
-												AK: Unref old m_data in String's move assignment

We were overriding the data pointer without unreffing it,
causing a memory leak when assigning a String.

											
										
										
											2022-12-08 17:30:04 +00:00
+								TEST_CASE(move_assignment)
 								{
 								    String string1 = MUST(String::from_utf8("hello"sv));
 								    string1 = MUST(String::from_utf8("friends!"sv));
 								    EXPECT_EQ(string1, "friends!"sv);
 								}
-												AK: Introduce the new String, replacement for DeprecatedString

DeprecatedString (formerly String) has been with us since the start,
and it has served us well. However, it has a number of shortcomings
that I'd like to address.

Some of these issues are hard if not impossible to solve incrementally
inside of DeprecatedString, so instead of doing that, let's build a new
String class and then incrementally move over to it instead.

Problems in DeprecatedString:

- It assumes string allocation never fails. This makes it impossible
  to use in allocation-sensitive contexts, and is the reason we had to
  ban DeprecatedString from the kernel entirely.

- The awkward null state. DeprecatedString can be null. It's different
  from the empty state, although null strings are considered empty.
  All code is immediately nicer when using Optional<DeprecatedString>
  but DeprecatedString came before Optional, which is how we ended up
  like this.

- The encoding of the underlying data is ambiguous. For the most part,
  we use it as if it's always UTF-8, but there have been cases where
  we pass around strings in other encodings (e.g. ISO 8859-1).

- operator[] and length() are used to iterate over DeprecatedString one
  byte at a time. This is done all over the codebase, and will *not*
  give the right results unless the string is all ASCII.

How we solve these issues in the new String:

- Functions that may allocate now return ErrorOr<String> so that ENOMEM
  errors can be passed to the caller.

- String has no null state. Use Optional<String> when needed.

- String is always UTF-8. This is validated when constructing a String.
  We may need to add a bypass for this in the future, for cases where
  you have a known-good string, but for now: validate all the things!

- There is no operator[] or length(). You can get the underlying data
  with bytes(), but for iterating over code points, you should be using
  a UTF-8 iterator.

Furthermore, it has two nifty new features:

- String implements a small string optimization (SSO) for strings that
  can fit entirely within a pointer. This means up to 3 bytes on 32-bit
  platforms, and 7 bytes on 64-bit platforms. Such small strings will
  not be heap-allocated.

- String can create substrings without making a deep copy of the
  substring. Instead, the superstring gets +1 refcount from the
  substring, and it acts like a view into the superstring. To make
  substrings like this, use the substring_with_shared_superstring() API.

One caveat:

- String does not guarantee that the underlying data is null-terminated
  like DeprecatedString does today. While this was nifty in a handful of
  places where we were calling C functions, it did stand in the way of
  shared-superstring substrings.

											
										
										
											2022-12-01 12:27:43 +00:00
+								TEST_CASE(short_strings)
 								{
 								#ifdef AK_ARCH_64_BIT
 								    auto string = MUST(String::from_utf8("abcdefg"sv));
 								    EXPECT_EQ(string.is_short_string(), true);
 								    EXPECT_EQ(string.bytes().size(), 7u);
 								    EXPECT_EQ(string.bytes_as_string_view(), "abcdefg"sv);
 								#else
 								    auto string = MUST(String::from_utf8("abc"sv));
 								    EXPECT_EQ(string.is_short_string(), true);
 								    EXPECT_EQ(string.bytes().size(), 3u);
 								    EXPECT_EQ(string.bytes_as_string_view(), "abc"sv);
 								#endif
 								}
 								TEST_CASE(long_strings)
 								{
 								    auto string = MUST(String::from_utf8("abcdefgh"sv));
 								    EXPECT_EQ(string.is_short_string(), false);
 								    EXPECT_EQ(string.bytes().size(), 8u);
 								    EXPECT_EQ(string.bytes_as_string_view(), "abcdefgh"sv);
 								}
 								TEST_CASE(substring)
 								{
 								    auto superstring = MUST(String::from_utf8("Hello I am a long string"sv));
 								    auto short_substring = MUST(superstring.substring_from_byte_offset(0, 5));
 								    EXPECT_EQ(short_substring, "Hello"sv);
 								    auto long_substring = MUST(superstring.substring_from_byte_offset(0, 10));
 								    EXPECT_EQ(long_substring, "Hello I am"sv);
 								}
 								TEST_CASE(code_points)
 								{
 								    auto string = MUST(String::from_utf8("🦬🪒"sv));
 								    Vector<u32> code_points;
 								    for (auto code_point : string.code_points())
 								        code_points.append(code_point);
 								    EXPECT_EQ(code_points[0], 0x1f9acu);
 								    EXPECT_EQ(code_points[1], 0x1fa92u);
 								}
 								TEST_CASE(string_builder)
 								{
 								    StringBuilder builder;
 								    builder.append_code_point(0x1f9acu);
 								    builder.append_code_point(0x1fa92u);
 								    auto string = MUST(builder.to_string());
 								    EXPECT_EQ(string, "🦬🪒"sv);
 								    EXPECT_EQ(string.bytes().size(), 8u);
 								}
 								TEST_CASE(ak_format)
 								{
 								    auto foo = MUST(String::formatted("Hello {}", MUST(String::from_utf8("friends"sv))));
 								    EXPECT_EQ(foo, "Hello friends"sv);
 								}
 								TEST_CASE(replace)
 								{
 								    {
 								        auto haystack = MUST(String::from_utf8("Hello enemies"sv));
 								        auto result = MUST(haystack.replace("enemies"sv, "friends"sv, ReplaceMode::All));
 								        EXPECT_EQ(result, "Hello friends"sv);
 								    }
 								    {
 								        auto base_title = MUST(String::from_utf8("anon@courage:~"sv));
 								        auto result = MUST(base_title.replace("[*]"sv, "(*)"sv, ReplaceMode::FirstOnly));
 								        EXPECT_EQ(result, "anon@courage:~"sv);
 								    }
 								}
-												AK+LibUnicode: Provide Unicode-aware String case transformations

Since AK can't refer to LibUnicode directly, the strategy here is that
if you need case transformations, you can link LibUnicode and receive
them. If you try to use either of these methods without linking it, then
you'll of course get a linker error (note we don't do any fallbacks to
e.g. ASCII case transformations). If you don't need these methods, you
don't have to link LibUnicode.

											
										
										
											2023-01-08 21:33:30 +00:00
 								TEST_CASE(to_lowercase)
 								{
 								    {
 								        auto string = MUST(String::from_utf8("Aa"sv));
 								        auto result = MUST(string.to_lowercase());
 								        EXPECT_EQ(result, "aa"sv);
 								    }
 								    {
 								        auto string = MUST(String::from_utf8("Ωω"sv));
 								        auto result = MUST(string.to_lowercase());
 								        EXPECT_EQ(result, "ωω"sv);
 								    }
 								    {
 								        auto string = MUST(String::from_utf8("İi̇"sv));
 								        auto result = MUST(string.to_lowercase());
 								        EXPECT_EQ(result, "i̇i̇"sv);
 								    }
 								}
 								TEST_CASE(to_uppercase)
 								{
 								    {
 								        auto string = MUST(String::from_utf8("Aa"sv));
 								        auto result = MUST(string.to_uppercase());
 								        EXPECT_EQ(result, "AA"sv);
 								    }
 								    {
 								        auto string = MUST(String::from_utf8("Ωω"sv));
 								        auto result = MUST(string.to_uppercase());
 								        EXPECT_EQ(result, "ΩΩ"sv);
 								    }
 								    {
 								        auto string = MUST(String::from_utf8("ŉ"sv));
 								        auto result = MUST(string.to_uppercase());
 								        EXPECT_EQ(result, "ʼN"sv);
 								    }
 								}