Updated nrex to v0.2

* Fixed capturing groups matching to invalid results
 * Fixed parents of recursive quantifiers not expanding properly
 * Fixed LookAhead sometimes adding to result
This commit is contained in:
Zher Huei Lee 2016-04-08 13:29:37 +01:00
parent d454e64f42
commit 6207d56b95
3 changed files with 113 additions and 43 deletions

View file

@ -1,6 +1,8 @@
# NREX: Node RegEx
Version 0.1
[![Build Status](https://travis-ci.org/leezh/nrex.svg?branch=master)](https://travis-ci.org/leezh/nrex)
** Version 0.2 **
Small node-based regular expression library. It only does text pattern
matching, not replacement. To use add the files `nrex.hpp`, `nrex.cpp`
@ -38,7 +40,7 @@ Currently supported features:
## License
Copyright (c) 2015, Zher Huei Lee
Copyright (c) 2015-2016, Zher Huei Lee
All rights reserved.
This software is provided 'as-is', without any express or implied
@ -59,3 +61,15 @@ freely, subject to the following restrictions:
3. This notice may not be removed or altered from any source
distribution.
# Changes
## Version 0.2 (2016-08-04)
* Fixed capturing groups matching to invalid results
* Fixed parents of recursive quantifiers not expanding properly
* Fixed LookAhead sometimes adding to result
* More verbose unit testing
## Version 0.1 (2015-12-04)
* Initial release

View file

@ -1,7 +1,7 @@
// NREX: Node RegEx
// Version 0.1
// Version 0.2
//
// Copyright (c) 2015, Zher Huei Lee
// Copyright (c) 2015-2016, Zher Huei Lee
// All rights reserved.
//
// This software is provided 'as-is', without any express or implied
@ -68,6 +68,13 @@ class nrex_array
{
}
// Constructs an empty array whose backing storage is requested up front
// for `size` elements: the element count starts at 0 while the reserved
// capacity is recorded as `size`.
// NOTE(review): NREX_NEW_ARRAY is defined outside this view; presumably it
// allocates `size` objects of type T - confirm against the macro.
nrex_array(unsigned int size)
: _data(NREX_NEW_ARRAY(T, size))
, _reserved(size)
, _size(0)
{
}
~nrex_array()
{
NREX_DELETE_ARRAY(_data);
@ -100,7 +107,7 @@ class nrex_array
_size++;
}
T& top()
const T& top() const
{
return _data[_size - 1];
}
@ -189,17 +196,19 @@ struct nrex_search
nrex_result* captures;
int end;
bool complete;
nrex_array<int> lookahead_pos;
// Returns the character at index `pos` of the string being searched.
// No range check is done here; `pos` is used to index `str` directly.
nrex_char at(int pos)
{
return str[pos];
}
nrex_search(const nrex_char* str, nrex_result* captures)
nrex_search(const nrex_char* str, nrex_result* captures, int lookahead)
: str(str)
, captures(captures)
, end(0)
{
lookahead_pos.reserve(lookahead);
}
};
@ -239,13 +248,17 @@ struct nrex_node
{
pos = next->test(s, pos);
}
if (pos >= 0)
{
s->complete = true;
}
if (parent && pos >= 0)
{
pos = parent->test_parent(s, pos);
}
if (pos >= 0)
if (pos < 0)
{
s->complete = true;
s->complete = false;
}
return pos;
}
@ -274,25 +287,31 @@ struct nrex_node
}
};
// Kind of a nrex_node_group. Replaces the negative integer `mode`
// constants (NonCapture, Bracket, LookAhead, LookBehind) previously
// declared inside nrex_node_group.
enum nrex_group_type
{
// Group whose match start/length is written to s->captures[id].
nrex_group_capture,
// Group created for "(?:" and for plain "(" once no capture slots remain.
nrex_group_non_capture,
// Group created for a "[" bracket expression.
nrex_group_bracket,
// Group created for "(?=" / "(?!"; never quantifiable.
nrex_group_look_ahead,
// Group created for "(?<=" / "(?<!"; never quantifiable.
nrex_group_look_behind,
};
struct nrex_node_group : public nrex_node
{
static const int NonCapture = -1;
static const int Bracket = -2;
static const int LookAhead = -3;
static const int LookBehind = -4;
int mode;
nrex_group_type type;
int id;
bool negate;
nrex_array<nrex_node*> childset;
nrex_node* back;
nrex_node_group(int mode)
nrex_node_group(nrex_group_type type, int id = 0)
: nrex_node(true)
, mode(mode)
, type(type)
, id(id)
, negate(false)
, back(NULL)
{
if (mode != Bracket)
if (type != nrex_group_bracket)
{
length = 0;
}
@ -300,7 +319,7 @@ struct nrex_node_group : public nrex_node
{
length = 1;
}
if (mode == LookAhead || mode == LookBehind)
if (type == nrex_group_look_ahead || type == nrex_group_look_behind)
{
quantifiable = false;
}
@ -317,15 +336,17 @@ struct nrex_node_group : public nrex_node
int test(nrex_search* s, int pos) const
{
if (mode >= 0)
int old_start;
if (type == nrex_group_capture)
{
s->captures[mode].start = pos;
old_start = s->captures[id].start;
s->captures[id].start = pos;
}
for (unsigned int i = 0; i < childset.size(); ++i)
{
s->complete = false;
int offset = 0;
if (mode == LookBehind)
if (type == nrex_group_look_behind)
{
if (pos < length)
{
@ -333,7 +354,15 @@ struct nrex_node_group : public nrex_node
}
offset = length;
}
if (type == nrex_group_look_ahead)
{
s->lookahead_pos.push(pos);
}
int res = childset[i]->test(s, pos - offset);
if (type == nrex_group_look_ahead)
{
s->lookahead_pos.pop();
}
if (s->complete)
{
return res;
@ -355,32 +384,40 @@ struct nrex_node_group : public nrex_node
}
if (res >= 0)
{
if (mode >= 0)
if (type == nrex_group_capture)
{
s->captures[mode].length = res - pos;
s->captures[id].length = res - pos;
}
else if (mode == LookAhead || mode == LookBehind)
else if (type == nrex_group_look_ahead || type == nrex_group_look_behind)
{
res = pos;
}
return next ? next->test(s, res) : res;
}
}
if (type == nrex_group_capture)
{
s->captures[id].start = old_start;
}
return -1;
}
virtual int test_parent(nrex_search* s, int pos) const
{
if (mode >= 0)
if (type == nrex_group_capture)
{
s->captures[mode].length = pos - s->captures[mode].start;
s->captures[id].length = pos - s->captures[id].start;
}
if (type == nrex_group_look_ahead)
{
pos = s->lookahead_pos[id];
}
return nrex_node::test_parent(s, pos);
}
void add_childset()
{
if (childset.size() > 0 && mode != Bracket)
if (childset.size() > 0 && type != nrex_group_bracket)
{
length = -1;
}
@ -391,7 +428,7 @@ struct nrex_node_group : public nrex_node
{
node->parent = this;
node->previous = back;
if (back && mode != Bracket)
if (back && type != nrex_group_bracket)
{
back->next = node;
}
@ -399,7 +436,7 @@ struct nrex_node_group : public nrex_node
{
childset.push(node);
}
if (mode != Bracket)
if (type != nrex_group_bracket)
{
increment_length(node->length);
}
@ -418,7 +455,7 @@ struct nrex_node_group : public nrex_node
{
childset.pop();
}
if (mode != Bracket)
if (type != nrex_group_bracket)
{
increment_length(old->length, true);
}
@ -436,7 +473,7 @@ struct nrex_node_group : public nrex_node
{
childset.pop();
}
if (mode != Bracket)
if (type != nrex_group_bracket)
{
increment_length(old->length, true);
}
@ -887,6 +924,12 @@ struct nrex_node_quantifier : public nrex_node
}
return -1;
}
// Override of test_parent for quantifier nodes.
// Clears the search's `complete` flag and hands `pos` back unchanged; unlike
// nrex_node_group::test_parent it does not call nrex_node::test_parent.
// NOTE(review): presumably this stops a child's match from being reported
// as final so the quantifier itself keeps control - confirm against test().
virtual int test_parent(nrex_search* s, int pos) const
{
s->complete = false;
return pos;
}
};
struct nrex_node_anchor : public nrex_node
@ -986,7 +1029,7 @@ bool nrex_has_lookbehind(nrex_array<nrex_node_group*>& stack)
{
for (unsigned int i = 0; i < stack.size(); i++)
{
if (stack[i]->mode == nrex_node_group::LookBehind)
if (stack[i]->type == nrex_group_look_behind)
{
return true;
}
@ -996,12 +1039,14 @@ bool nrex_has_lookbehind(nrex_array<nrex_node_group*>& stack)
// Default constructor: starts with zero capture groups, zero look-ahead
// depth and no root node. A root is only assigned later by compile().
nrex::nrex()
: _capturing(0)
, _lookahead_depth(0)
, _root(NULL)
{
}
nrex::nrex(const nrex_char* pattern, int captures)
: _capturing(0)
, _lookahead_depth(0)
, _root(NULL)
{
compile(pattern, captures);
@ -1023,6 +1068,7 @@ bool nrex::valid() const
void nrex::reset()
{
_capturing = 0;
_lookahead_depth = 0;
if (_root)
{
NREX_DELETE(_root);
@ -1042,9 +1088,10 @@ int nrex::capture_size() const
bool nrex::compile(const nrex_char* pattern, int captures)
{
reset();
nrex_node_group* root = NREX_NEW(nrex_node_group(_capturing));
nrex_node_group* root = NREX_NEW(nrex_node_group(nrex_group_capture, _capturing));
nrex_array<nrex_node_group*> stack;
stack.push(root);
unsigned int lookahead_level = 0;
_root = root;
for (const nrex_char* c = pattern; c[0] != '\0'; ++c)
@ -1056,22 +1103,26 @@ bool nrex::compile(const nrex_char* pattern, int captures)
if (c[2] == ':')
{
c = &c[2];
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture));
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_non_capture));
stack.top()->add_child(group);
stack.push(group);
}
else if (c[2] == '!' || c[2] == '=')
{
c = &c[2];
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookAhead));
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_look_ahead, lookahead_level++));
group->negate = (c[0] == '!');
stack.top()->add_child(group);
stack.push(group);
if (lookahead_level > _lookahead_depth)
{
_lookahead_depth = lookahead_level;
}
}
else if (c[2] == '<' && (c[3] == '!' || c[3] == '='))
{
c = &c[3];
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookBehind));
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_look_behind));
group->negate = (c[0] == '!');
stack.top()->add_child(group);
stack.push(group);
@ -1083,13 +1134,13 @@ bool nrex::compile(const nrex_char* pattern, int captures)
}
else if (captures >= 0 && _capturing < captures)
{
nrex_node_group* group = NREX_NEW(nrex_node_group(++_capturing));
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_capture, ++_capturing));
stack.top()->add_child(group);
stack.push(group);
}
else
{
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture));
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_non_capture));
stack.top()->add_child(group);
stack.push(group);
}
@ -1098,6 +1149,10 @@ bool nrex::compile(const nrex_char* pattern, int captures)
{
if (stack.size() > 1)
{
if (stack.top()->type == nrex_group_look_ahead)
{
--lookahead_level;
}
stack.pop();
}
else
@ -1107,7 +1162,7 @@ bool nrex::compile(const nrex_char* pattern, int captures)
}
else if (c[0] == '[')
{
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::Bracket));
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_bracket));
stack.top()->add_child(group);
if (c[1] == '^')
{
@ -1410,7 +1465,7 @@ bool nrex::match(const nrex_char* str, nrex_result* captures, int offset, int en
{
return false;
}
nrex_search s(str, captures);
nrex_search s(str, captures, _lookahead_depth);
if (end >= offset)
{
s.end = end;

View file

@ -1,7 +1,7 @@
// NREX: Node RegEx
// Version 0.1
// Version 0.2
//
// Copyright (c) 2015, Zher Huei Lee
// Copyright (c) 2015-2016, Zher Huei Lee
// All rights reserved.
//
// This software is provided 'as-is', without any express or implied
@ -57,7 +57,8 @@ class nrex_node;
class nrex
{
private:
int _capturing;
unsigned int _capturing;
unsigned int _lookahead_depth;
nrex_node* _root;
public: