patterncppMinor
Optimising single-delimiter string tokenisation
Viewed 0 times
optimisingsingledelimitertokenisationstring
Problem
I am trying to optimise my tokenizing of tab delimited strings:
where
This function is called on the result from each
Each
Does anyone have advice for squeezing more speed out of this?
static void split_line(string &line, input_record &rec)
{
int col = 0;
char *row = &line[0];
char *token = strtok(row, "\t");
while (token)
{
switch(++col)
{
case 2 : rec.sequence = token; break;
case 4 : rec.content = token; break;
case 9 : rec.position = atoi(token); return;
}
token = strtok(NULL, "\t");
}
}where
struct typedef
{
string sequence;
string content;
unsigned int position;
} input_recordThis function is called on the result from each
getline from a parent, causing split_line to be called over 100 million times.Each
line has at least 15 columns (of which 2, 4 and 9 are useful), and I assign the relevant tokens to variables in a struct. The first 10 columns of a string contain roughly 150 characters (variable length) and I am handling millions of records. Currently, it takes ~0.8µs to process a string, and is a bottleneck in my code.Does anyone have advice for squeezing more speed out of this?
Solution
I tried the following change to your function. It does not use
Here's the test program and results from my testing:
Results:
~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 50000000
Time: 3
Time: 2
~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 100000000
Time: 6
Time: 4
~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 200000000
Time: 13
Time: 9
~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 400000000
Time: 26
Time: 18
Update
If I change the
~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 400000000
Time: 16
Time: 8
If that is an option, that will be save you a bunch of time.
strtok. It does the tokenizing in the function itself. I got a little bit of speed up.void split_line2(string &line, input_record &rec)
{
int col = 0;
char *start = &line[0];
char sep = ','; // Using ',' instead of '\t' for my testing.
for ( char* iter = start; *iter != '\0'; ++iter )
{
if ( *iter == sep )
{
*iter = '\0';
switch(++col)
{
case 2 : rec.sequence = start; break;
case 4 : rec.content = start; break;
case 9 : rec.position = atoi(start); return;
}
start = iter+1;
}
}
}Here's the test program and results from my testing:
#include
#include
#include
#include
using std::string;
struct input_record
{
string sequence;
string content;
unsigned int position;
};
void split_line1(string &line, input_record &rec)
{
int col = 0;
char *row = &line[0];
char *token = strtok(row, ",");
while (token)
{
switch(++col)
{
case 2 : rec.sequence = token; break;
case 4 : rec.content = token; break;
case 9 : rec.position = atoi(token); return;
}
token = strtok(NULL, ",");
}
}
void split_line2(string &line, input_record &rec)
{
int col = 0;
char *start = &line[0];
char sep = ',';
for ( char* iter = start; *iter != '\0'; ++iter )
{
if ( *iter == sep )
{
*iter = '\0';
switch(++col)
{
case 2 : rec.sequence = start; break;
case 4 : rec.content = start; break;
case 9 : rec.position = atoi(start); return;
}
start = iter+1;
}
}
}
void test1(std::string &line, int n)
{
clock_t start = clock();
for ( int i = 0; i < n; ++i )
{
input_record rec;
split_line1(line, rec);
}
clock_t end = clock();
std::cout << "Time: " << (end-start)/CLOCKS_PER_SEC << std::endl;
}
void test2(std::string &line, int n)
{
clock_t start = clock();
for ( int i = 0; i < n; ++i )
{
input_record rec;
split_line2(line, rec);
}
clock_t end = clock();
std::cout << "Time: " << (end-start)/CLOCKS_PER_SEC << std::endl;
}
int main(int argc, char** argv)
{
std::string line(argv[1]);
int n = atoi(argv[2]);
test1(line, n);
test2(line, n);
}Results:
~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 50000000
Time: 3
Time: 2
~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 100000000
Time: 6
Time: 4
~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 200000000
Time: 13
Time: 9
~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 400000000
Time: 26
Time: 18
Update
If I change the
struct from containing std::string to char*, there is substantial savings.struct input_record
{
char* sequence;
char* content;
unsigned int position;
};~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 400000000
Time: 16
Time: 8
If that is an option, that will be save you a bunch of time.
Code Snippets
void split_line2(string &line, input_record &rec)
{
int col = 0;
char *start = &line[0];
char sep = ','; // Using ',' instead of '\t' for my testing.
for ( char* iter = start; *iter != '\0'; ++iter )
{
if ( *iter == sep )
{
*iter = '\0';
switch(++col)
{
case 2 : rec.sequence = start; break;
case 4 : rec.content = start; break;
case 9 : rec.position = atoi(start); return;
}
start = iter+1;
}
}
}#include <iostream>
#include <string>
#include <ctime>
#include <cstring>
using std::string;
struct input_record
{
string sequence;
string content;
unsigned int position;
};
void split_line1(string &line, input_record &rec)
{
int col = 0;
char *row = &line[0];
char *token = strtok(row, ",");
while (token)
{
switch(++col)
{
case 2 : rec.sequence = token; break;
case 4 : rec.content = token; break;
case 9 : rec.position = atoi(token); return;
}
token = strtok(NULL, ",");
}
}
void split_line2(string &line, input_record &rec)
{
int col = 0;
char *start = &line[0];
char sep = ',';
for ( char* iter = start; *iter != '\0'; ++iter )
{
if ( *iter == sep )
{
*iter = '\0';
switch(++col)
{
case 2 : rec.sequence = start; break;
case 4 : rec.content = start; break;
case 9 : rec.position = atoi(start); return;
}
start = iter+1;
}
}
}
void test1(std::string &line, int n)
{
clock_t start = clock();
for ( int i = 0; i < n; ++i )
{
input_record rec;
split_line1(line, rec);
}
clock_t end = clock();
std::cout << "Time: " << (end-start)/CLOCKS_PER_SEC << std::endl;
}
void test2(std::string &line, int n)
{
clock_t start = clock();
for ( int i = 0; i < n; ++i )
{
input_record rec;
split_line2(line, rec);
}
clock_t end = clock();
std::cout << "Time: " << (end-start)/CLOCKS_PER_SEC << std::endl;
}
int main(int argc, char** argv)
{
std::string line(argv[1]);
int n = atoi(argv[2]);
test1(line, n);
test2(line, n);
}struct input_record
{
char* sequence;
char* content;
unsigned int position;
};Context
StackExchange Code Review Q#79624, answer score: 2
Revisions (0)
No revisions yet.