patterncMajor

Copying 80 bytes as fast as possible

Submitted by: @import:stackexchange-codereview·Mar 10, 2026·

Viewed 0 times

codereview sse c stackoverflow optimization bitwise

fastpossiblecopyingbytes

Problem

I am running a math-oriented computation that spends a significant amount of its time doing memcpy, always copying 80 bytes (an array of 20 32-bit ints) from one location to another. The total computation takes around 4-5 days using both cores of my i7, so even a 1% speedup results in about an hour saved.

By using the memcpy from this paper by Intel, I was able to speed things up by about 25%; dropping the size argument and simply declaring the size inside the function also seems to have a small effect. However, I feel I am not utilising the fact that my copying operations are always the same size. That said, I can't come up with a better way.

/*
 * Copy exactly 80 bytes from `a` to `b`, one byte at a time.
 *
 * b: destination buffer, at least 80 bytes; must not overlap `a`
 *    (both pointers are __restrict-qualified).
 * a: source buffer, at least 80 bytes.
 *
 * Returns `b`, mirroring the memcpy() convention.
 */
void *memcpyi80(void* __restrict b, const void* __restrict a){
    const size_t count = 80;   /* fixed copy size in bytes */
    char *dst = b;
    const char *src = a;
    size_t i;

    for (i = 0; i < count; ++i) {
        dst[i] = src[i];
    }
    return b;
}

Some other things that may be useful for optimization:

- I use an Intel Core i7-2620M, based on Sandy Bridge. I don't care about portability at all.

- I only care about the 16 least significant bits of every int. The other 16 are useless to me and are permanently zeroed out.

- Even though I copy 20 32-bit ints per memcpy invocation, I only care about the first 17. I have added 3 as it helps with alignment and therefore speed.

- I use GCC 4.6 on Windows 7.

Any ideas?

UPDATE:

I think this is the assembly output (never done this before, there may be more than you need):

```
memcpyi80:
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
.seh_endprologue
movq %rdx, %r9
movq %rcx, %rax
negq %r9
andl $15, %r9d
je .L165
movzbl (%rdx), %ecx
leaq -1(%r9), %r10
movl $79, %esi
andl $7, %r10d
cmpq $1, %r9
movl $79, %ebx
leaq 1(%rdx), %r8
movl $1, %r11d
movb %cl, (%rax)
leaq 1(%rax), %rcx
jbe .L159

Solution

The fastest way to do this would be to align your data on 16-byte boundaries, then the entire copy just becomes 5 copies through XMM registers.

This is over twice as fast as your version on my machine.

Store your data like this:

#include <xmmintrin.h>
/*
 * One 80-byte record (20 32-bit ints, equivalently 5 16-byte SSE vectors).
 * The two members of the anonymous union overlay the same storage, so the
 * record can be filled through `i` and copied through `v`.
 */
struct Data
{
    union
    {
        /* Element-wise view: 20 ints. */
        int i[20];
        /* Vector view: 5 __m128 values; this is what memcpyv5() takes. */
        __m128 v[5];
    };
};

Then the copy function is just:

/*
 * Copy five __m128 vectors (80 bytes) from `a` to `b`.
 *
 * b: destination, room for 5 __m128 values; must not overlap `a`.
 * a: source, 5 __m128 values.
 *
 * Both pointers are dereferenced as __m128 objects, so callers are expected
 * to pass 16-byte-aligned storage. Every source vector is read into a local
 * before any destination vector is written.
 */
void memcpyv5(__m128* __restrict b, const __m128* __restrict a)
{
    enum { VEC_COUNT = 5 };
    __m128 staged[VEC_COUNT];
    int k;

    /* Read phase: pull all source vectors into locals. */
    for (k = 0; k < VEC_COUNT; ++k) {
        staged[k] = a[k];
    }

    /* Write phase: flush the locals to the destination. */
    for (k = 0; k < VEC_COUNT; ++k) {
        b[k] = staged[k];
    }
}

// Example: copy one Data record into another through the vector view.
// Note: `Data` without the `struct` keyword is the C++ spelling; in C the
// declaration would read `struct Data dst, src;`.
Data dst, src;
memcpyv5(dst.v, src.v);

Assembly output:

# Compiler output for memcpyv5 (x86-64, AT&T syntax).
__Z8memcpyv5PU8__vectorfPKS_:
LFB493:
    pushq   %rbp
LCFI2:
    movq    %rsp, %rbp
LCFI3:
    # Five aligned 16-byte loads from offsets 0..64 of %rsi into xmm0-xmm4
    # (presumably %rsi holds `a`, the source pointer -- confirm against the ABI).
    movaps  16(%rsi), %xmm3
    movaps  32(%rsi), %xmm2
    movaps  48(%rsi), %xmm1
    movaps  64(%rsi), %xmm0
    movaps  (%rsi), %xmm4
    # Five aligned 16-byte stores to the matching offsets of %rdi
    # (presumably %rdi holds `b`, the destination pointer).
    movaps  %xmm4, (%rdi)
    movaps  %xmm3, 16(%rdi)
    movaps  %xmm2, 32(%rdi)
    movaps  %xmm1, 48(%rdi)
    movaps  %xmm0, 64(%rdi)
    leave
    ret

Code Snippets

#include <xmmintrin.h>
/*
 * One 80-byte record (20 32-bit ints, equivalently 5 16-byte SSE vectors).
 * The two members of the anonymous union overlay the same storage, so the
 * record can be filled through `i` and copied through `v`.
 */
struct Data
{
    union
    {
        /* Element-wise view: 20 ints. */
        int i[20];
        /* Vector view: 5 __m128 values; this is what memcpyv5() takes. */
        __m128 v[5];
    };
};

/*
 * Copy five __m128 vectors (80 bytes) from `a` to `b`.
 *
 * b: destination, room for 5 __m128 values; must not overlap `a`.
 * a: source, 5 __m128 values.
 *
 * Both pointers are dereferenced as __m128 objects, so callers are expected
 * to pass 16-byte-aligned storage. Every source vector is read into a local
 * before any destination vector is written.
 */
void memcpyv5(__m128* __restrict b, const __m128* __restrict a)
{
    enum { VEC_COUNT = 5 };
    __m128 staged[VEC_COUNT];
    int k;

    /* Read phase: pull all source vectors into locals. */
    for (k = 0; k < VEC_COUNT; ++k) {
        staged[k] = a[k];
    }

    /* Write phase: flush the locals to the destination. */
    for (k = 0; k < VEC_COUNT; ++k) {
        b[k] = staged[k];
    }
}

// Example: copy one Data record into another through the vector view.
// Note: `Data` without the `struct` keyword is the C++ spelling; in C the
// declaration would read `struct Data dst, src;`.
Data dst, src;
memcpyv5(dst.v, src.v);

# Compiler output for memcpyv5 (x86-64, AT&T syntax).
__Z8memcpyv5PU8__vectorfPKS_:
LFB493:
    pushq   %rbp
LCFI2:
    movq    %rsp, %rbp
LCFI3:
    # Five aligned 16-byte loads from offsets 0..64 of %rsi into xmm0-xmm4
    # (presumably %rsi holds `a`, the source pointer -- confirm against the ABI).
    movaps  16(%rsi), %xmm3
    movaps  32(%rsi), %xmm2
    movaps  48(%rsi), %xmm1
    movaps  64(%rsi), %xmm0
    movaps  (%rsi), %xmm4
    # Five aligned 16-byte stores to the matching offsets of %rdi
    # (presumably %rdi holds `b`, the destination pointer).
    movaps  %xmm4, (%rdi)
    movaps  %xmm3, 16(%rdi)
    movaps  %xmm2, 32(%rdi)
    movaps  %xmm1, 48(%rdi)
    movaps  %xmm0, 64(%rdi)
    leave
    ret

Context

StackExchange Code Review Q#5520, answer score: 32

Revisions (0)

No revisions yet.