serenity/AK/GenericLexer.h
Timothy Flynn fd8ccedf2b AK: Add GenericLexer API to consume an escaped Unicode code point
This parsing is already duplicated between LibJS and LibRegex, and will
shortly be needed in more places in those libraries. Move it to AK to
prevent further duplication.

This API will consume escaped Unicode code points of the form:
    \\u{code point}
    \\unnnn (where each n is a hexadecimal digit)
    \\unnnn\\unnnn (where the two escaped values are a surrogate pair)
2021-08-19 23:49:25 +02:00

232 lines
5.8 KiB
C++

/*
* Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Result.h>
#include <AK/StringView.h>
namespace AK {
class GenericLexer {
public:
constexpr explicit GenericLexer(const StringView& input)
: m_input(input)
{
}
constexpr size_t tell() const { return m_index; }
constexpr size_t tell_remaining() const { return m_input.length() - m_index; }
StringView remaining() const { return m_input.substring_view(m_index); }
constexpr bool is_eof() const { return m_index >= m_input.length(); }
constexpr char peek(size_t offset = 0) const
{
return (m_index + offset < m_input.length()) ? m_input[m_index + offset] : '\0';
}
constexpr bool next_is(char expected) const
{
return peek() == expected;
}
constexpr bool next_is(StringView expected) const
{
for (size_t i = 0; i < expected.length(); ++i)
if (peek(i) != expected[i])
return false;
return true;
}
constexpr bool next_is(const char* expected) const
{
for (size_t i = 0; expected[i] != '\0'; ++i)
if (peek(i) != expected[i])
return false;
return true;
}
constexpr void retreat()
{
VERIFY(m_index > 0);
--m_index;
}
constexpr void retreat(size_t count)
{
VERIFY(m_index >= count);
m_index -= count;
}
constexpr char consume()
{
VERIFY(!is_eof());
return m_input[m_index++];
}
template<typename T>
constexpr bool consume_specific(const T& next)
{
if (!next_is(next))
return false;
if constexpr (requires { next.length(); }) {
ignore(next.length());
} else {
ignore(sizeof(next));
}
return true;
}
bool consume_specific(const String& next)
{
return consume_specific(StringView { next });
}
constexpr bool consume_specific(const char* next)
{
return consume_specific(StringView { next });
}
constexpr char consume_escaped_character(char escape_char = '\\', const StringView& escape_map = "n\nr\rt\tb\bf\f")
{
if (!consume_specific(escape_char))
return consume();
auto c = consume();
for (size_t i = 0; i < escape_map.length(); i += 2) {
if (c == escape_map[i])
return escape_map[i + 1];
}
return c;
}
StringView consume(size_t count);
StringView consume_all();
StringView consume_line();
StringView consume_until(char);
StringView consume_until(const char*);
StringView consume_quoted_string(char escape_char = 0);
String consume_and_unescape_string(char escape_char = '\\');
enum class UnicodeEscapeError {
MalformedUnicodeEscape,
UnicodeEscapeOverflow,
};
Result<u32, UnicodeEscapeError> consume_escaped_code_point(bool combine_surrogate_pairs = true);
constexpr void ignore(size_t count = 1)
{
count = min(count, m_input.length() - m_index);
m_index += count;
}
constexpr void ignore_until(char stop)
{
while (!is_eof() && peek() != stop) {
++m_index;
}
ignore();
}
constexpr void ignore_until(const char* stop)
{
while (!is_eof() && !next_is(stop)) {
++m_index;
}
ignore(__builtin_strlen(stop));
}
/*
* Conditions are used to match arbitrary characters. You can use lambdas,
* ctype functions, or is_any_of() and its derivatives (see below).
* A few examples:
* - `if (lexer.next_is(isdigit))`
* - `auto name = lexer.consume_while([](char c) { return isalnum(c) || c == '_'; });`
* - `lexer.ignore_until(is_any_of("<^>"));`
*/
// Test the next character against a Condition
template<typename TPredicate>
constexpr bool next_is(TPredicate pred) const
{
return pred(peek());
}
// Consume and return characters while `pred` returns true
template<typename TPredicate>
StringView consume_while(TPredicate pred)
{
size_t start = m_index;
while (!is_eof() && pred(peek()))
++m_index;
size_t length = m_index - start;
if (length == 0)
return {};
return m_input.substring_view(start, length);
}
// Consume and return characters until `pred` return true
template<typename TPredicate>
StringView consume_until(TPredicate pred)
{
size_t start = m_index;
while (!is_eof() && !pred(peek()))
++m_index;
size_t length = m_index - start;
if (length == 0)
return {};
return m_input.substring_view(start, length);
}
// Ignore characters while `pred` returns true
template<typename TPredicate>
constexpr void ignore_while(TPredicate pred)
{
while (!is_eof() && pred(peek()))
++m_index;
}
// Ignore characters until `pred` return true
// We don't skip the stop character as it may not be a unique value
template<typename TPredicate>
constexpr void ignore_until(TPredicate pred)
{
while (!is_eof() && !pred(peek()))
++m_index;
}
protected:
StringView m_input;
size_t m_index { 0 };
private:
Result<u32, UnicodeEscapeError> decode_code_point();
Result<u32, UnicodeEscapeError> decode_single_or_paired_surrogate(bool combine_surrogate_pairs);
};
constexpr auto is_any_of(const StringView& values)
{
return [values](auto c) { return values.contains(c); };
}
constexpr auto is_path_separator = is_any_of("/\\");
constexpr auto is_quote = is_any_of("'\"");
}
using AK::GenericLexer;
using AK::is_any_of;
using AK::is_path_separator;
using AK::is_quote;