patterncMajor
Copying 80 bytes as fast as possible
Viewed 0 times
fastpossiblecopyingbytes
Problem
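I am running a math-oriented computation that spends a significant amount of its time doing `memcpy`, always copying 80 bytes from one location to the next, an array of 20 32-bit ints. The total computation takes around 4-5 days using both cores of my i7, so even a 1% speedup results in about an hour saved.
By using the `memcpy` in this paper by Intel, I was able to speed up by about 25%, and also dropping the size argument and simply declaring it inside the function seems to have some small effect. However, I feel I am not utilising the fact that my copying operations are always the same size. That said, I can't come up with a better way.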
Some other things that may be useful for optimization:
-
I use an Intel Core i7-2620M, based on Sandy Bridge. I don't care about portability at all.
-
I only care about the 16 least significant bits of every int. The other 16 are useless to me and are permanently zeroed out.
-
Even though I copy 20 32-bit ints per memcpy invocation, I only care about the first 17. I have added 3 as it helps with alignment and therefore speed.
-
I use GCC 4.6 on Windows 7.
Any ideas?
UPDATE:
I think this is the assembly output (never done this before, there may be more than you need):
```
memcpyi80:
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
.seh_endprologue
movq %rdx, %r9
movq %rcx, %rax
negq %r9
andl $15, %r9d
je .L165
movzbl (%rdx), %ecx
leaq -1(%r9), %r10
movl $79, %esi
andl $7, %r10d
cmpq $1, %r9
movl $79, %ebx
leaq 1(%rdx), %r8
movl $1, %r11d
movb %cl, (%rax)
leaq 1(%rax), %rcx
jbe .L159
I am running a math-oriented computation that spends a significant amount of its time doing `memcpy`, always copying 80 bytes from one location to the next, an array of 20 32-bit ints. The total computation takes around 4-5 days using both cores of my i7, so even a 1% speedup results in about an hour saved.
By using the `memcpy` in this paper by Intel, I was able to speed up by about 25%, and also dropping the size argument and simply declaring it inside the function seems to have some small effect. However, I feel I am not utilising the fact that my copying operations are always the same size. That said, I can't come up with a better way.
void *memcpyi80(void* __restrict b, const void* __restrict a){
size_t n = 80;
char *s1 = b;
const char *s2 = a;
for(; 0<n; --n)*s1++ = *s2++;
return b;
}
Some other things that may be useful for optimization:
-
I use an Intel Core i7-2620M, based on Sandy Bridge. I don't care about portability at all.
-
I only care about the 16 least significant bits of every int. The other 16 are useless to me and are permanently zeroed out.
-
Even though I copy 20 32-bit ints per memcpy invocation, I only care about the first 17. I have added 3 as it helps with alignment and therefore speed.
-
I use GCC 4.6 on Windows 7.
Any ideas?
UPDATE:
I think this is the assembly output (never done this before, there may be more than you need):
```
memcpyi80:
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
.seh_endprologue
movq %rdx, %r9
movq %rcx, %rax
negq %r9
andl $15, %r9d
je .L165
movzbl (%rdx), %ecx
leaq -1(%r9), %r10
movl $79, %esi
andl $7, %r10d
cmpq $1, %r9
movl $79, %ebx
leaq 1(%rdx), %r8
movl $1, %r11d
movb %cl, (%rax)
leaq 1(%rax), %rcx
jbe .L159
Solution
The fastest way to do this would be to align your data on 16-byte boundaries, then the entire copy just becomes 5 copies through XMM registers.
This is over twice as fast as your version on my machine.
Store your data like this:
Then the copy function is just:
Assembly output:
This is over twice as fast as your version on my machine.
Store your data like this:
#include <xmmintrin.h>
struct Data
{
union
{
int i[20];
__m128 v[5];
};
};
Then the copy function is just:
/*
 * Copy five 16-byte vectors (80 bytes in total) from `a` to `b`.
 *
 * Both pointers are accessed as __m128, so each must be 16-byte aligned.
 * __restrict tells the compiler the two regions do not overlap.
 * Every load is issued before the first store.
 */
void memcpyv5(__m128* __restrict b, const __m128* __restrict a)
{
    const float *src = (const float *)a;
    float *dst = (float *)b;

    /* Aligned loads: one XMM-sized value per 4-float stride. */
    const __m128 v0 = _mm_load_ps(src + 0);
    const __m128 v1 = _mm_load_ps(src + 4);
    const __m128 v2 = _mm_load_ps(src + 8);
    const __m128 v3 = _mm_load_ps(src + 12);
    const __m128 v4 = _mm_load_ps(src + 16);

    /* Aligned stores to the matching offsets in the destination. */
    _mm_store_ps(dst + 0, v0);
    _mm_store_ps(dst + 4, v1);
    _mm_store_ps(dst + 8, v2);
    _mm_store_ps(dst + 12, v3);
    _mm_store_ps(dst + 16, v4);
}
// Example
Data dst, src;
memcpyv5(dst.v, src.v);
Assembly output:
# Compiler output for memcpyv5 (mangled C++ symbol); AT&T operand order (source, destination).
# %rsi is only read and %rdi is only written, matching `a` and `b` in the C source.
__Z8memcpyv5PU8__vectorfPKS_:
LFB493:
# Prologue: establish a frame pointer. No stack space is reserved.
pushq %rbp
LCFI2:
movq %rsp, %rbp
LCFI3:
# Load all five 16-byte chunks (offsets 0..64 from %rsi) into %xmm0-%xmm4
# before any store. movaps is the aligned move, so %rsi must be 16-byte aligned.
movaps 16(%rsi), %xmm3
movaps 32(%rsi), %xmm2
movaps 48(%rsi), %xmm1
movaps 64(%rsi), %xmm0
movaps (%rsi), %xmm4
# Store the five chunks at the same offsets from %rdi (likewise 16-byte aligned).
movaps %xmm4, (%rdi)
movaps %xmm3, 16(%rdi)
movaps %xmm2, 32(%rdi)
movaps %xmm1, 48(%rdi)
movaps %xmm0, 64(%rdi)
# Epilogue: undo the prologue (restore %rsp from %rbp, pop %rbp).
leave
ret
Code Snippets
#include <xmmintrin.h>
/*
 * 80-byte record viewable either as 20 ints or as five 16-byte vectors.
 * The __m128 member gives the whole struct 16-byte alignment.
 */
struct Data
{
    union
    {
        int i[20];      /* element-wise view */
        __m128 v[5];    /* vector view used for copying */
    };
};

/*
 * Copy five 16-byte vectors (80 bytes in total) from `a` to `b`.
 *
 * Both pointers are accessed as __m128, so each must be 16-byte aligned.
 * __restrict tells the compiler the two regions do not overlap.
 * Every load is issued before the first store.
 */
void memcpyv5(__m128* __restrict b, const __m128* __restrict a)
{
    const float *src = (const float *)a;
    float *dst = (float *)b;

    /* Aligned loads: one XMM-sized value per 4-float stride. */
    const __m128 v0 = _mm_load_ps(src + 0);
    const __m128 v1 = _mm_load_ps(src + 4);
    const __m128 v2 = _mm_load_ps(src + 8);
    const __m128 v3 = _mm_load_ps(src + 12);
    const __m128 v4 = _mm_load_ps(src + 16);

    /* Aligned stores to the matching offsets in the destination. */
    _mm_store_ps(dst + 0, v0);
    _mm_store_ps(dst + 4, v1);
    _mm_store_ps(dst + 8, v2);
    _mm_store_ps(dst + 12, v3);
    _mm_store_ps(dst + 16, v4);
}
// Example
Data dst, src;
memcpyv5(dst.v, src.v);
__Z8memcpyv5PU8__vectorfPKS_:
# Instruction stream of the compiled memcpyv5; AT&T operand order (source, destination).
# %rsi is only read and %rdi is only written, matching `a` and `b` in the C source.
LFB493:
# Prologue: establish a frame pointer. No stack space is reserved.
pushq %rbp
LCFI2:
movq %rsp, %rbp
LCFI3:
# Load all five 16-byte chunks (offsets 0..64 from %rsi) into %xmm0-%xmm4
# before any store. movaps is the aligned move, so %rsi must be 16-byte aligned.
movaps 16(%rsi), %xmm3
movaps 32(%rsi), %xmm2
movaps 48(%rsi), %xmm1
movaps 64(%rsi), %xmm0
movaps (%rsi), %xmm4
# Store the five chunks at the same offsets from %rdi (likewise 16-byte aligned).
movaps %xmm4, (%rdi)
movaps %xmm3, 16(%rdi)
movaps %xmm2, 32(%rdi)
movaps %xmm1, 48(%rdi)
movaps %xmm0, 64(%rdi)
# Epilogue: undo the prologue (restore %rsp from %rbp, pop %rbp).
leave
ret
Context
StackExchange Code Review Q#5520, answer score: 32
Revisions (0)
No revisions yet.