patterncppModerate
Flexible string parser
Viewed 0 times
parserstringflexible
Problem
I am writing an mathematical expression parser, and therefore I need fast way to edit any string in any imaginable way.
My current solution: (compilable code)
```
#include
#include
#include
#include
#include
using namespace std;
class is_whitespace {
public:
inline bool operator() (const char & c)
{
return ( ' ' == c) || ('\n' == c) ||
('\r' == c) || ('\t' == c) ||
('\b' == c) || ('\v' == c) ||
('\f' == c);
}
};
inline void erase_whitespaces(string & expr) {
expr.erase(remove_if(expr.begin(), expr.end(), is_whitespace()), expr.end());
}
class FindDelim {
private:
string & out_delim;
public:
inline FindDelim(string & out) : out_delim(out) {}
public:
inline bool operator()(char & c, string & delim) {
if (string(&c, delim.size()) == delim) {
out_delim = delim;
return true;
}
return false;
}
};
inline void split_string(
vector & output_str,
vector & output_del,
const string::iterator & str_beg,
const string::iterator & str_end,
const vector::iterator & del_beg,
const vector::iterator & del_end)
{
string delim;
size_t delim_len;
string::iterator last = str_beg;
string::iterator next = str_beg;
while ((next = find_first_of(last, str_end, del_beg, del_end, FindDelim(delim))) != str_end) {
delim_len = delim.size();
output_str.push_back(string(&(*last), distance(last, next)));
output_del.push_back(string(&(*next), delim_len));
last = next + delim_len;
}
output_str.push_back(string(&(*last)));
}
inline void split_string(
vector & output_str,
vector & output_del,
const string::iterator & beg,
const string::iterator & end,
vector & delims)
{
return split_string(output_str, output_del, beg, end, delims.begin(), delims.end());
}
inline void split_string(
vector & output_str,
vector
My current solution: (compilable code)
```
#include
#include
#include
#include
#include
using namespace std;
class is_whitespace {
public:
inline bool operator() (const char & c)
{
return ( ' ' == c) || ('\n' == c) ||
('\r' == c) || ('\t' == c) ||
('\b' == c) || ('\v' == c) ||
('\f' == c);
}
};
inline void erase_whitespaces(string & expr) {
expr.erase(remove_if(expr.begin(), expr.end(), is_whitespace()), expr.end());
}
class FindDelim {
private:
string & out_delim;
public:
inline FindDelim(string & out) : out_delim(out) {}
public:
inline bool operator()(char & c, string & delim) {
if (string(&c, delim.size()) == delim) {
out_delim = delim;
return true;
}
return false;
}
};
inline void split_string(
vector & output_str,
vector & output_del,
const string::iterator & str_beg,
const string::iterator & str_end,
const vector::iterator & del_beg,
const vector::iterator & del_end)
{
string delim;
size_t delim_len;
string::iterator last = str_beg;
string::iterator next = str_beg;
while ((next = find_first_of(last, str_end, del_beg, del_end, FindDelim(delim))) != str_end) {
delim_len = delim.size();
output_str.push_back(string(&(*last), distance(last, next)));
output_del.push_back(string(&(*next), delim_len));
last = next + delim_len;
}
output_str.push_back(string(&(*last)));
}
inline void split_string(
vector & output_str,
vector & output_del,
const string::iterator & beg,
const string::iterator & end,
vector & delims)
{
return split_string(output_str, output_del, beg, end, delims.begin(), delims.end());
}
inline void split_string(
vector & output_str,
vector
Solution
You probably don't need to modify the input string. Assuming that whitespace is allowed only between tokens (i.e. that user's aren't expected to write
Prefer to use
Consider providing an input iterator interface for your tokeniser. That will enable consumers to gather the tokens into a standard collection, or to use them in streaming fashion without unnecessary overhead.
You should understand the implications of
You assign to
The
I suggest using standard predicates instead of writing your own -
I verified that the output is identical to your original.
532 as 5 32 or += as + =), then each token can be returned as a std::string_view - provided you're willing to keep the string alive for as long as you use any of the views.Prefer to use
const_iterator types when you won't be modifying the values; I think that all the iterators in split_string() can be const iterators if you change the signature of FindDelim::operator() to accept its arguments as reference to const:bool FindDelim::operator()(const char & c, const string & delim)
{
if (string(&c, delim.size()) == delim) {
out_delim = delim;
return true;
}
return false;
}
inline void split_string(
vector & output_str,
vector & output_del,
const string::const_iterator & str_beg,
const string::const_iterator & str_end,
const vector::const_iterator & del_beg,
const vector::const_iterator & del_end)
{
string delim;
size_t delim_len;
string::const_iterator last = str_beg;
string::const_iterator next = str_beg;
while ((next = find_first_of(last, str_end, del_beg, del_end, FindDelim(delim))) != str_end) {
delim_len = delim.size();
output_str.push_back(string(&(*last), distance(last, next)));
output_del.push_back(string(&(*next), delim_len));
last = next + delim_len;
}
output_str.push_back(string(&(*last)));
}
inline void split_string(
vector & output_str,
vector & output_del,
const string::const_iterator & beg,
const string::const_iterator & end,
vector & delims)
{
split_string(output_str, output_del, beg, end, delims.begin(), delims.end());
}
inline void split_string(
vector & output_str,
vector & output_del,
const string::const_iterator & beg,
const string::const_iterator & end,
vector && delims)
{
split_string(output_str, output_del, beg, end, delims.begin(), delims.end());
}Consider providing an input iterator interface for your tokeniser. That will enable consumers to gather the tokens into a standard collection, or to use them in streaming fashion without unnecessary overhead.
You should understand the implications of
using namespace std;. I would advise you to steer clear of it at file scope.You assign to
FindDelim::out_delim but never read from it - that suggests that it can be removed without changing the meaning.The
FindDelim::out_delim member is a reference that allows modification of non-owned objects. To me, I find that such references weaken the encapsulation of the class, and I prefer to avoid them (as evidenced by the fact I needed to edit my answer here). Perhaps there's a way to limit the action-at-a-distance that's easily overlooked. (I don't have a concrete suggestion here; perhaps some other answerer will propose an improvement).I suggest using standard predicates instead of writing your own -
is_whitespace is better written as std::isspace (you'll need to include `, of course) unless you really expect backspace in the input.
Although you may return void-expression; from a function that returns void, I would reserve this for use only in template functions that may or may not return void depending on their instantiation - not for functions that always return void.
Finally, please ditch the pointless read at the end of main() - it caused my run to hang waiting for input (which it won't get, in Emacs's compilation buffer...). It's not portable to rely on it without including , anyway.
Sample code
Here's an approach using std::regex` that addresses the points above. Its performance can obviously be improved, but I'd recommend doing that by swapping out the regex for a custom parser without changing the interface. You can see how the iterators provide flexibility and allow the use of standard algorithms:#include
#include
#include
#include
#include
#include
int main()
{
const std::string expr{"[X] += 2 + 100 + 32 + 231 -= 123 + 532"};
const std::regex separator{"\\+=|-="};
std::vector tokens;
std::copy(std::sregex_token_iterator(expr.begin(), expr.end(), separator, -1),
std::sregex_token_iterator(),
std::back_inserter(tokens));
// Remove spaces
static int (*const is_space)(int) = std::isspace;
for (auto& s: tokens)
s.erase(std::remove_if(s.begin(), s.end(), is_space), s.end());
// Print result:
std::copy(tokens.begin(), tokens.end(),
std::ostream_iterator(std::cout, "\n"));
}I verified that the output is identical to your original.
Code Snippets
bool FindDelim::operator()(const char & c, const string & delim)
{
if (string(&c, delim.size()) == delim) {
out_delim = delim;
return true;
}
return false;
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
const string::const_iterator & str_beg,
const string::const_iterator & str_end,
const vector<string>::const_iterator & del_beg,
const vector<string>::const_iterator & del_end)
{
string delim;
size_t delim_len;
string::const_iterator last = str_beg;
string::const_iterator next = str_beg;
while ((next = find_first_of(last, str_end, del_beg, del_end, FindDelim(delim))) != str_end) {
delim_len = delim.size();
output_str.push_back(string(&(*last), distance(last, next)));
output_del.push_back(string(&(*next), delim_len));
last = next + delim_len;
}
output_str.push_back(string(&(*last)));
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
const string::const_iterator & beg,
const string::const_iterator & end,
vector<string> & delims)
{
split_string(output_str, output_del, beg, end, delims.begin(), delims.end());
}
inline void split_string(
vector<string> & output_str,
vector<string> & output_del,
const string::const_iterator & beg,
const string::const_iterator & end,
vector<string> && delims)
{
split_string(output_str, output_del, beg, end, delims.begin(), delims.end());
}#include <algorithm>
#include <iostream>
#include <iterator>
#include <regex>
#include <string>
#include <vector>
int main()
{
const std::string expr{"[X] += 2 + 100 + 32 + 231 -= 123 + 532"};
const std::regex separator{"\\+=|-="};
std::vector<std::string> tokens;
std::copy(std::sregex_token_iterator(expr.begin(), expr.end(), separator, -1),
std::sregex_token_iterator(),
std::back_inserter(tokens));
// Remove spaces
static int (*const is_space)(int) = std::isspace;
for (auto& s: tokens)
s.erase(std::remove_if(s.begin(), s.end(), is_space), s.end());
// Print result:
std::copy(tokens.begin(), tokens.end(),
std::ostream_iterator<std::string>(std::cout, "\n"));
}Context
StackExchange Code Review Q#162237, answer score: 11
Revisions (0)
No revisions yet.