1
0
mirror of https://github.com/SerenityOS/serenity synced 2024-07-09 05:20:45 +00:00

LibXML+Tests: Consume > in the character data ending ]]> and test it

For example, with this input:
```xml
<C>]]>
```
After seeing `<C>`, the parser will start parsing the content of the
element. The content parser will then parse any character data it sees.

The character parser would see the first two `]]` and consume them.
Then, it would see the `>` and set the state machine to say we have
seen this, but it did _not_ consume it and would instead tell
GenericLexer that it should stop consuming characters. Therefore,
we only consumed 2 characters.

Then, it would see that we are in the state where we've seen the
full `]]>` and try to take off three characters from the end of the
consumed input when we only have 2 characters, causing an assertion
failure as we are asking to take off more characters than there really
are.
This commit is contained in:
Luke Wilde 2022-05-29 22:25:43 +01:00 committed by Linus Groh
parent 9a97ffe883
commit adb5f7e485
4 changed files with 32 additions and 2 deletions

View File

@ -23,6 +23,7 @@ add_subdirectory(LibTimeZone)
add_subdirectory(LibUnicode)
add_subdirectory(LibWasm)
add_subdirectory(LibWeb)
add_subdirectory(LibXML)
# The UserspaceEmulator subdirectory is only added when SERENITY_ARCH is "i686".
if (${SERENITY_ARCH} STREQUAL "i686")
add_subdirectory(UserspaceEmulator)
endif()

View File

@ -0,0 +1,7 @@
# Source files that make up the LibXML tests; add new test files here.
set(TEST_SOURCES TestParser.cpp)

# Hand every listed source to serenity_test, passing LibXML as the second
# argument (presumably the test group/directory - confirm against the
# serenity_test definition) and LibXML as the library to link via LIBS.
foreach(test_source IN LISTS TEST_SOURCES)
    serenity_test("${test_source}" LibXML LIBS LibXML)
endforeach()

View File

@ -0,0 +1,22 @@
/*
* Copyright (c) 2022, Luke Wilde <lukew@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibTest/TestCase.h>
#include <LibXML/Parser/Parser.h>
TEST_CASE(char_data_ending)
{
    // Regression test for element content that consists solely of the `]]>` sequence.
    //
    // Background on the crash this guards against:
    //  1. After `<C>`, the parser starts parsing the element's content, and the content
    //     parser parses whatever character data it finds.
    //  2. The character data parser would consume the two `]` characters, then see the `>`
    //     and record in its state machine that the full `]]>` was seen -- but it did _not_
    //     consume the `>`; it told GenericLexer to stop consuming instead. Only 2 characters
    //     had therefore been consumed.
    //  3. Being in the "seen `]]>`" state, it would then try to strip three characters from
    //     the end of the consumed input, which only held 2, tripping an assertion because
    //     more characters were requested than were available.
    EXPECT_NO_CRASH("parsing character data ending by itself should not crash", [] {
        XML::Parser parser("<C>]]>");

        // Only the absence of a crash matters here; the parse result itself is not inspected.
        [[maybe_unused]] auto parse_result = parser.parse();

        return Test::Crash::Failure::DidNotCrash;
    });
}

View File

@ -891,7 +891,7 @@ ErrorOr<StringView, ParseError> Parser::parse_char_data()
// CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
auto cend_state = 0; // 1: ], 2: ], 3: >
auto text = m_lexer.consume_while([&](auto ch) {
if (ch == '<' || ch == '&')
if (ch == '<' || ch == '&' || cend_state == 3)
return false;
switch (cend_state) {
case 0:
@ -904,7 +904,7 @@ ErrorOr<StringView, ParseError> Parser::parse_char_data()
case 2:
if (ch == '>') {
cend_state++;
return false;
return true;
}
cend_state = 0;
return true;