patterncppMinor
Beating memcmp in C++
Viewed 0 times
beatingmemcmpstackoverflow
Problem
Once again I decided to beat system
Result is 400% improvement - from about 1:15 min to 0:25 min.
Finally I had rewritten
However I did not tested with random data, so I am not sure what role the cache line and branch predictor plays in the tests.
Here is the code:
```
#include
#include
namespace{
template
int memcmp_fixed_(const unsigned char s1, const unsigned char s2){
for(size_t i = 0; i
int memcmp_fixed_(const unsigned char s1, const unsigned char s2){
return s1 - s2;
}
template
int memcmp_fixed_(const void a1, const void a2){
const unsigned char s1 = (const unsigned char ) a1;
const unsigned char s2 = (const unsigned char ) a2;
return memcmp_fixed_(s1, s2);
}
}
inline int fast_memcmp(const void a1, const void a2, size_t const size){
switch(size){
case 0: return 0;
case 1: return memcmp_fixed_(a1, a2);
case 2: return memcmp_fixed_(a1, a2);
case 3: return memcmp_fixed_(a1, a2);
case 4: return memcmp_fixed_(a1, a2);
case 5: return memcmp_fixed_(a1, a2);
case 6: return memcmp_fixed_(a1, a2);
case 7: return memcmp_fixed_(a1, a2);
case 8: return memcmp_fixed_(a1, a2);
case 9: return memcmp_fixed_(a1, a2);
case 10: return memcmp_fixed_(a1, a2);
case 21: return memcmp_fixed_(a1, a2);
case 22: return memcmp_fixed_(a1, a2);
case 23: return memcmp_fixed_(a1, a2);
case 24: return memcmp_fixed_(a1, a2);
case 25: return memcmp_fixed_(a1, a2);
case 26: return memcmp_fixed_(a1, a2);
case 27: return memcmp_fixed_(a1, a2);
case 28: return memcmp_fixed_(a1, a2);
case 29: return memcmp_fixed_(a1, a2);
case 30: return memcmp_fixed_(a1, a2);
memcmp function. This time I decided to use a template and to "precompile" all cases from 0 to 31 bytes.Result is 400% improvement - from about 1:15 min to 0:25 min.
Finally I had rewritten
memcmp_fixed_ with naive looking for statement and I noticed that the compiler can optimize it as well.However I did not tested with random data, so I am not sure what role the cache line and branch predictor plays in the tests.
Here is the code:
```
#include
#include
namespace{
template
int memcmp_fixed_(const unsigned char s1, const unsigned char s2){
for(size_t i = 0; i
int memcmp_fixed_(const unsigned char s1, const unsigned char s2){
return s1 - s2;
}
template
int memcmp_fixed_(const void a1, const void a2){
const unsigned char s1 = (const unsigned char ) a1;
const unsigned char s2 = (const unsigned char ) a2;
return memcmp_fixed_(s1, s2);
}
}
inline int fast_memcmp(const void a1, const void a2, size_t const size){
switch(size){
case 0: return 0;
case 1: return memcmp_fixed_(a1, a2);
case 2: return memcmp_fixed_(a1, a2);
case 3: return memcmp_fixed_(a1, a2);
case 4: return memcmp_fixed_(a1, a2);
case 5: return memcmp_fixed_(a1, a2);
case 6: return memcmp_fixed_(a1, a2);
case 7: return memcmp_fixed_(a1, a2);
case 8: return memcmp_fixed_(a1, a2);
case 9: return memcmp_fixed_(a1, a2);
case 10: return memcmp_fixed_(a1, a2);
case 21: return memcmp_fixed_(a1, a2);
case 22: return memcmp_fixed_(a1, a2);
case 23: return memcmp_fixed_(a1, a2);
case 24: return memcmp_fixed_(a1, a2);
case 25: return memcmp_fixed_(a1, a2);
case 26: return memcmp_fixed_(a1, a2);
case 27: return memcmp_fixed_(a1, a2);
case 28: return memcmp_fixed_(a1, a2);
case 29: return memcmp_fixed_(a1, a2);
case 30: return memcmp_fixed_(a1, a2);
Solution
Since we're using the C++ wrappers around standard C functions (which is good - I like it!), we need namespace qualification of
Instead of repeating the type when casting
I managed to simplify
I slightly simplified the test program to make it a single-character change to build the baseline test (and to send error messages where they belong), and massively reducing the repeated code:
std::size_t and std::memcmp). Although your implementation evidently takes advantage of the license to also include the unqualified names, it's not portable to rely on that.Instead of repeating the type when casting
a1 to s1 and a2 to s2, we can just use auto (and let's be clear about the cast - prefer reinterpret_cast to a catch-all C-style cast).I managed to simplify
memcmp_fixed_ by using a recursive template. For me (with g++ -03), this gave roughly the same execution speed (I guess that loop unrolling makes the resultant binary very similar). I got rid of the separate void and unsigned char overloads - that's just a compile-time overhead and makes no different to runtime speed, which gives 9 lines, compared to your 14 (counting physical lines that contain alphanumerics).template
int memcmp_fixed_(const void *a1, const void *a2)
{
auto const s1 = reinterpret_cast(a1);
auto const s2 = reinterpret_cast(a2);
auto const diff = *s1 - *s2;
return diff ? diff : memcmp_fixed_(s1+1, s2+1);
}
template<>
int memcmp_fixed_(const void*, const void*)
{
return 0;
}I slightly simplified the test program to make it a single-character change to build the baseline test (and to send error messages where they belong), and massively reducing the repeated code:
#if 1
#define test_memcmp fast_memcmp
#else
#define test_memcmp std::memcmp
#endif
int main(int argc, char **argv)
{
if (argc != 3) {
std::fprintf(stderr, "Usage:\n\t%s [string1] [string2]\n", argv[0]);
return 1;
}
const char *s1 = argv[1];
const char *s2 = argv[2];
auto const size = std::min(strlen(s1), strlen(s2));
volatile int x = 0;
for (volatile std::size_t i = 0; i < MAX; ++i)
x += test_memcmp(s1, s2, size);
std::printf("%d %d\n", test_memcmp(s1, s2, size), x);
}Code Snippets
template<std::size_t SIZE>
int memcmp_fixed_(const void *a1, const void *a2)
{
auto const s1 = reinterpret_cast<const unsigned char*>(a1);
auto const s2 = reinterpret_cast<const unsigned char*>(a2);
auto const diff = *s1 - *s2;
return diff ? diff : memcmp_fixed_<SIZE-1>(s1+1, s2+1);
}
template<>
int memcmp_fixed_<0>(const void*, const void*)
{
return 0;
}#if 1
#define test_memcmp fast_memcmp
#else
#define test_memcmp std::memcmp
#endif
int main(int argc, char **argv)
{
if (argc != 3) {
std::fprintf(stderr, "Usage:\n\t%s [string1] [string2]\n", argv[0]);
return 1;
}
const char *s1 = argv[1];
const char *s2 = argv[2];
auto const size = std::min(strlen(s1), strlen(s2));
volatile int x = 0;
for (volatile std::size_t i = 0; i < MAX; ++i)
x += test_memcmp(s1, s2, size);
std::printf("%d %d\n", test_memcmp(s1, s2, size), x);
}Context
StackExchange Code Review Q#162984, answer score: 4
Revisions (0)
No revisions yet.