patterncMinor
A regular expression parsing library in C
Viewed 0 times
expressionregularparsinglibrary
Problem
I've created a regular expression (regex) parsing library in C, and would like some feedback on it. Speed is really important to me, but any and all suggestions are acceptable.
```
#include
static int regex_matchHere(const char regex, char s, int *len);
static int regex_matchGroup(int c, int group);
static int regex_matchQuantity(int quant, int c, const char regex, char s, int *len);
int regex_match(const char regex, char s, int *len)
{
char *p = s;
/ force match from the beginning of the string /
if (regex[0] == '^') return (regex_matchHere(regex + 1, s, len) ? 0 : -1);
/ iterate the string to find matching position /
do
{
*len = 0;
if (regex_matchHere(regex, p, len)) return (int)(p - s);
} while (*p++ != '\0');
return -1;
}
static int regex_matchHere(const char regex, char s, int *len)
{
int c = regex[0];
if (regex[0] == '\0') return 1; / end of regex = full match /
else if (regex[0] == '$' && regex[1] == '\0') return (s == '\0'); / check end of string */
else if (regex[0] == '\\' && regex[1] != '\0') / check escaped symbol /
{
c = regex[1];
if (c != '^' && c != '$' && c != '\\' && c != '+' && c != '*' && c != '-' && c != '?') c = c | 0x100;
regex = regex + 1;
}
/ check for special operators ,+,?,- */
if (regex[1] == '*' || regex[1] == '+' || regex[1] == '-' || regex[1] == '?') return regex_matchQuantity(regex[1], c, regex+2, s, len);
else if (s != '\0' && regex_matchGroup(s, c))
{
len = len + 1;
return regex_matchHere(regex+1, s+1, len);
}
return 0;
}
static int regex_matchGroup(int c, int group)
{
if ((group & 0xff) == '.') group ^= 0x100;
if (group s);
}
else if (quant == '-') / match as little as possible /
{
do
{
if (regex_matchHere(regex, s, len)) return 1;
len = len + 1;
} while (s != '\0' && regex_matchGroup(s++, c)
```
#include
static int regex_matchHere(const char regex, char s, int *len);
static int regex_matchGroup(int c, int group);
static int regex_matchQuantity(int quant, int c, const char regex, char s, int *len);
int regex_match(const char regex, char s, int *len)
{
char *p = s;
/ force match from the beginning of the string /
if (regex[0] == '^') return (regex_matchHere(regex + 1, s, len) ? 0 : -1);
/ iterate the string to find matching position /
do
{
*len = 0;
if (regex_matchHere(regex, p, len)) return (int)(p - s);
} while (*p++ != '\0');
return -1;
}
static int regex_matchHere(const char regex, char s, int *len)
{
int c = regex[0];
if (regex[0] == '\0') return 1; / end of regex = full match /
else if (regex[0] == '$' && regex[1] == '\0') return (s == '\0'); / check end of string */
else if (regex[0] == '\\' && regex[1] != '\0') / check escaped symbol /
{
c = regex[1];
if (c != '^' && c != '$' && c != '\\' && c != '+' && c != '*' && c != '-' && c != '?') c = c | 0x100;
regex = regex + 1;
}
/ check for special operators ,+,?,- */
if (regex[1] == '*' || regex[1] == '+' || regex[1] == '-' || regex[1] == '?') return regex_matchQuantity(regex[1], c, regex+2, s, len);
else if (s != '\0' && regex_matchGroup(s, c))
{
len = len + 1;
return regex_matchHere(regex+1, s+1, len);
}
return 0;
}
static int regex_matchGroup(int c, int group)
{
if ((group & 0xff) == '.') group ^= 0x100;
if (group s);
}
else if (quant == '-') / match as little as possible /
{
do
{
if (regex_matchHere(regex, s, len)) return 1;
len = len + 1;
} while (s != '\0' && regex_matchGroup(s++, c)
Solution
What you did well
The code seems clean and logically organized. I like your 0x100-bit hack to indicate special characters. You could make that convention more obvious in the comments, though.
What you could improve on
-
The return value of
Trying to return the position of the match just leads to confusion, reminiscent of the way PHP's
I suggest that the signature for
Alternatively, return a pointer to a new
-
Regular expressions often include modifier flags, such as a case-insensitive flag or a continue-searching-where-the-previous-match-ended flag. You might want to plan your interfaces accordingly. (To support the latter, the
-
For performance, regular expressions are frequently compiled into an automaton. You interpret the regular expressions as you go. You may wish design the library's interface to have a
-
The function name
-
You need unit tests!
The code seems clean and logically organized. I like your 0x100-bit hack to indicate special characters. You could make that convention more obvious in the comments, though.
What you could improve on
-
The return value of
regex_match() is weird. I'd like it to return a non-zero value if the match succeeded, and a zero value if the match failed, so that I can call it like this:if (regex_match(...)) {
// Do stuff for successful match
} else {
// Do stuff for failed match
}Trying to return the position of the match just leads to confusion, reminiscent of the way PHP's
strpos() returns 0 to indicate a successful match at the beginning of the subject (but FALSE to indicate a non-match). You don't want to be like PHP, do you?I suggest that the signature for
regex_match() should look like this:/**
* Returns 1 if matched, 0 if not matched.
*
* Pass a pointer to a match_result if you care to find out the
* details of the match (its length, position, and possibly other
* information supported in the future, such as parenthesized
* capture groups), or pass a NULL if you don't care about the details.
*/
int regex_match(const char *regex, const char *subject, struct match_result *result);Alternatively, return a pointer to a new
struct match_result if the match succeeded. The caller would have to free() the result later, though, so I don't like it as much.-
Regular expressions often include modifier flags, such as a case-insensitive flag or a continue-searching-where-the-previous-match-ended flag. You might want to plan your interfaces accordingly. (To support the latter, the
struct match_result* would probably become an in-out parameter rather than an out-parameter.)-
For performance, regular expressions are frequently compiled into an automaton. You interpret the regular expressions as you go. You may wish design the library's interface to have a
regex_compile() function that transforms the expression into a struct that is meaningful to your library but opaque to the user. For now, the "compilation" could just be the identity transformation; you can enhance it later when the need for better performance arises or when you enhance the feature set of the regular expressions.-
The function name
regex_matchGroup() confuses me. "Group" implies something like parentheses, I think. regex_matchAtom() might be a more appropriate name.-
You need unit tests!
Code Snippets
if (regex_match(...)) {
// Do stuff for successful match
} else {
// Do stuff for failed match
}/**
* Returns 1 if matched, 0 if not matched.
*
* Pass a pointer to a match_result if you care to find out the
* details of the match (its length, position, and possibly other
* information supported in the future, such as parenthesized
* capture groups), or pass a NULL if you don't care about the details.
*/
int regex_match(const char *regex, const char *subject, struct match_result *result);Context
StackExchange Code Review Q#44027, answer score: 9
Revisions (0)
No revisions yet.