HiveBrain v1.2.0
Get Started
← Back to all entries
patterncppMinor

Optimising single-delimiter string tokenisation

Submitted by: @import:stackexchange-codereview··
0
Viewed 0 times
optimisingsingledelimitertokenisationstring

Problem

I am trying to optimise my tokenizing of tab delimited strings:

static void split_line(string &line, input_record &rec)
{
    int col = 0;
    char *row = &line[0];
    char *token = strtok(row, "\t");
    while (token)
    {
        switch(++col)
        {
            case 2 : rec.sequence = token; break;
            case 4 : rec.content = token; break;
            case 9 : rec.position = atoi(token); return;
        }
        token = strtok(NULL, "\t");
    }
}


where

struct typedef
{
    string sequence;
    string content;
    unsigned int position;
} input_record


This function is called on the result from each getline from a parent, causing split_line to be called over 100 million times.

Each line has at least 15 columns (of which 2, 4 and 9 are useful), and I assign the relevant tokens to variables in a struct. The first 10 columns of a string contain roughly 150 characters (variable length) and I am handling millions of records. Currently, it takes ~0.8µs to process a string, and is a bottleneck in my code.

Does anyone have advice for squeezing more speed out of this?

Solution

I tried the following change to your function. It does not use strtok. It does the tokenizing in the function itself. I got a little bit of speed up.

void split_line2(string &line, input_record &rec)
{
    int col = 0;
    char *start = &line[0];
    char sep = ','; // Using ',' instead of '\t' for my testing.
    for ( char* iter = start; *iter != '\0'; ++iter )
    {
       if ( *iter == sep )
       {
          *iter = '\0';
          switch(++col)
          {
             case 2 : rec.sequence = start; break;
             case 4 : rec.content = start; break;
             case 9 : rec.position = atoi(start); return;
          }
          start = iter+1;
       }
    }
}


Here's the test program and results from my testing:

#include 
#include 
#include 
#include 

using std::string;

struct input_record
{
    string sequence;
    string content;
    unsigned int position;
};

void split_line1(string &line, input_record &rec)
{
    int col = 0;
    char *row = &line[0];
    char *token = strtok(row, ",");
    while (token)
    {
        switch(++col)
        {
            case 2 : rec.sequence = token; break;
            case 4 : rec.content = token; break;
            case 9 : rec.position = atoi(token); return;
        }
        token = strtok(NULL, ",");
    }
}

void split_line2(string &line, input_record &rec)
{
    int col = 0;
    char *start = &line[0];
    char sep = ',';
    for ( char* iter = start; *iter != '\0'; ++iter )
    {
       if ( *iter == sep )
       {
          *iter = '\0';
          switch(++col)
          {
             case 2 : rec.sequence = start; break;
             case 4 : rec.content = start; break;
             case 9 : rec.position = atoi(start); return;
          }
          start = iter+1;
       }
    }
}

void test1(std::string &line, int n)
{
   clock_t start = clock();

   for ( int i = 0; i < n; ++i )
   {
      input_record rec;
      split_line1(line, rec);
   }

   clock_t end = clock();
   std::cout << "Time: " << (end-start)/CLOCKS_PER_SEC << std::endl;
}

void test2(std::string &line, int n)
{
   clock_t start = clock();

   for ( int i = 0; i < n; ++i )
   {
      input_record rec;
      split_line2(line, rec);
   }

   clock_t end = clock();
   std::cout << "Time: " << (end-start)/CLOCKS_PER_SEC << std::endl;
}

int main(int argc, char** argv)
{
   std::string line(argv[1]);
   int n = atoi(argv[2]);
   test1(line, n);
   test2(line, n);
}


Results:

~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 50000000
Time: 3
Time: 2

~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 100000000
Time: 6
Time: 4

~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 200000000
Time: 13
Time: 9

~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 400000000
Time: 26
Time: 18

Update

If I change the struct from containing std::string to char*, there is substantial savings.

struct input_record
{
    char* sequence;
    char* content;
    unsigned int position;
};


~/Stack-Overflow/cpp>>./test-507 "1,abcd,3,xyz,4,5,6,7,8,9,10,11" 400000000
Time: 16
Time: 8

If that is an option, that will be save you a bunch of time.

Code Snippets

void split_line2(string &line, input_record &rec)
{
    int col = 0;
    char *start = &line[0];
    char sep = ','; // Using ',' instead of '\t' for my testing.
    for ( char* iter = start; *iter != '\0'; ++iter )
    {
       if ( *iter == sep )
       {
          *iter = '\0';
          switch(++col)
          {
             case 2 : rec.sequence = start; break;
             case 4 : rec.content = start; break;
             case 9 : rec.position = atoi(start); return;
          }
          start = iter+1;
       }
    }
}
#include <iostream>
#include <string>
#include <ctime>
#include <cstring>

using std::string;

struct input_record
{
    string sequence;
    string content;
    unsigned int position;
};

void split_line1(string &line, input_record &rec)
{
    int col = 0;
    char *row = &line[0];
    char *token = strtok(row, ",");
    while (token)
    {
        switch(++col)
        {
            case 2 : rec.sequence = token; break;
            case 4 : rec.content = token; break;
            case 9 : rec.position = atoi(token); return;
        }
        token = strtok(NULL, ",");
    }
}

void split_line2(string &line, input_record &rec)
{
    int col = 0;
    char *start = &line[0];
    char sep = ',';
    for ( char* iter = start; *iter != '\0'; ++iter )
    {
       if ( *iter == sep )
       {
          *iter = '\0';
          switch(++col)
          {
             case 2 : rec.sequence = start; break;
             case 4 : rec.content = start; break;
             case 9 : rec.position = atoi(start); return;
          }
          start = iter+1;
       }
    }
}

void test1(std::string &line, int n)
{
   clock_t start = clock();

   for ( int i = 0; i < n; ++i )
   {
      input_record rec;
      split_line1(line, rec);
   }

   clock_t end = clock();
   std::cout << "Time: " << (end-start)/CLOCKS_PER_SEC << std::endl;
}

void test2(std::string &line, int n)
{
   clock_t start = clock();

   for ( int i = 0; i < n; ++i )
   {
      input_record rec;
      split_line2(line, rec);
   }

   clock_t end = clock();
   std::cout << "Time: " << (end-start)/CLOCKS_PER_SEC << std::endl;
}

int main(int argc, char** argv)
{
   std::string line(argv[1]);
   int n = atoi(argv[2]);
   test1(line, n);
   test2(line, n);
}
struct input_record
{
    char* sequence;
    char* content;
    unsigned int position;
};

Context

StackExchange Code Review Q#79624, answer score: 2

Revisions (0)

No revisions yet.