patterncppMinor
Atomic floating-point addition
Viewed 0 times
floatingadditionpointatomic
Problem
I need to be able to atomically add a 32-bit floating point value to a location in memory. Here is what I came up with. While the code is Windows-specific, I'll extend it with Linux support using
While I tested this code and it appeared to work fine, I'll appreciate a fresh look from another pair of eyes to make sure this code is 100% safe.
I'm also very much interested in any performance advice as this code is used in a hot code path in a performance-sensitive application (interactive rendering).
Assembly output for
```
lorg$ = 8
ptr$ = 8
lnew$ = 16
operand$ = 16
?atomic_float_add@?A0x1534477b@foundation@@YAXPECMM@Z PROC
prefetchw BYTE PTR [rcx]
npad 13
$LL3@atomic_flo:
movss xmm0, DWORD PTR [rcx]
movss DWORD PTR lorg$[rsp], xmm0
addss
__sync_bool_compare_and_swap().While I tested this code and it appeared to work fine, I'll appreciate a fresh look from another pair of eyes to make sure this code is 100% safe.
I'm also very much interested in any performance advice as this code is used in a hot code path in a performance-sensitive application (interactive rendering).
atomic_float_add(), the main function:__forceinline void atomic_float_add(volatile float* ptr, const float operand)
{
assert(is_aligned(ptr, 4));
volatile LONG* lptr = reinterpret_cast(ptr);
LONG lorg, lnew;
do
{
const float forg = *ptr;
const float fnew = forg + operand;
lorg = binary_cast(forg);
lnew = binary_cast(fnew);
} while (InterlockedCompareExchange(lptr, lnew, lorg) != lorg);
}binary_cast() is implemented in such a way that it obeys strict aliasing rules:template
inline Target binary_cast(Source s)
{
BOOST_STATIC_ASSERT(sizeof(Target) == sizeof(Source));
union
{
Source m_source;
Target m_target;
} u;
u.m_source = s;
return u.m_target;
}is_aligned():template
inline bool is_aligned(const T ptr, const size_t alignment)
{
assert(alignment > 0);
assert(is_pow2(alignment));
const uintptr_t p = (uintptr_t)ptr;
return (p & (alignment - 1)) == 0;
}is_pow2():template
inline bool is_pow2(const T x)
{
return (x & (x - 1)) == 0;
}Assembly output for
atomic_float_add() (Visual Studio 2013, full optimization):```
lorg$ = 8
ptr$ = 8
lnew$ = 16
operand$ = 16
?atomic_float_add@?A0x1534477b@foundation@@YAXPECMM@Z PROC
prefetchw BYTE PTR [rcx]
npad 13
$LL3@atomic_flo:
movss xmm0, DWORD PTR [rcx]
movss DWORD PTR lorg$[rsp], xmm0
addss
Solution
@EOF commented that
Here is the new version with this optimization:
Here is the corresponding assembly:
Interestingly, the compiler decided to do one iteration before entering the loop. I'm not sure I understand the benefit of this...
InterlockedCompareExchange (and its gcc counterpart, __sync_val_compare_and_swap) returns the initial value of the destination address. This allows to remove one memory load from the retry loop.Here is the new version with this optimization:
__forceinline void atomic_float_add(volatile float* ptr, const float operand)
{
assert(is_aligned(ptr, 4));
volatile LONG* iptr = reinterpret_cast(ptr);
LONG expected = *iptr;
while (true)
{
const float value = binary_cast(expected);
const LONG new_value = binary_cast(value + operand);
const LONG actual = InterlockedCompareExchange(iptr, new_value, expected);
if (actual == expected)
return;
expected = actual;
}
}Here is the corresponding assembly:
?atomic_float_add@foundation@@YAXPECMM@Z PROC
mov eax, DWORD PTR [rcx]
mov r8, rcx
mov DWORD PTR value$2[rsp], eax
movss xmm0, DWORD PTR value$2[rsp]
addss xmm0, xmm1
movss DWORD PTR new_value$1[rsp], xmm0
mov edx, DWORD PTR new_value$1[rsp]
lock cmpxchg DWORD PTR [rcx], edx
je SHORT $LN16@atomic_float_add
npad 13
$LL3@atomic_float_add:
mov DWORD PTR value$2[rsp], eax
mov edx, eax
movss xmm0, DWORD PTR value$2[rsp]
addss xmm0, xmm1
movss DWORD PTR new_value$1[rsp], xmm0
mov ecx, DWORD PTR new_value$1[rsp]
lock cmpxchg DWORD PTR [r8], ecx
cmp eax, edx
jne SHORT $LL3@atomic_float_add
$LN16@atomic_float_add:
ret 0
?atomic_float_add@foundation@@YAXPECMM@Z ENDPInterestingly, the compiler decided to do one iteration before entering the loop. I'm not sure I understand the benefit of this...
Code Snippets
__forceinline void atomic_float_add(volatile float* ptr, const float operand)
{
assert(is_aligned(ptr, 4));
volatile LONG* iptr = reinterpret_cast<volatile LONG*>(ptr);
LONG expected = *iptr;
while (true)
{
const float value = binary_cast<float>(expected);
const LONG new_value = binary_cast<LONG>(value + operand);
const LONG actual = InterlockedCompareExchange(iptr, new_value, expected);
if (actual == expected)
return;
expected = actual;
}
}?atomic_float_add@foundation@@YAXPECMM@Z PROC
mov eax, DWORD PTR [rcx]
mov r8, rcx
mov DWORD PTR value$2[rsp], eax
movss xmm0, DWORD PTR value$2[rsp]
addss xmm0, xmm1
movss DWORD PTR new_value$1[rsp], xmm0
mov edx, DWORD PTR new_value$1[rsp]
lock cmpxchg DWORD PTR [rcx], edx
je SHORT $LN16@atomic_float_add
npad 13
$LL3@atomic_float_add:
mov DWORD PTR value$2[rsp], eax
mov edx, eax
movss xmm0, DWORD PTR value$2[rsp]
addss xmm0, xmm1
movss DWORD PTR new_value$1[rsp], xmm0
mov ecx, DWORD PTR new_value$1[rsp]
lock cmpxchg DWORD PTR [r8], ecx
cmp eax, edx
jne SHORT $LL3@atomic_float_add
$LN16@atomic_float_add:
ret 0
?atomic_float_add@foundation@@YAXPECMM@Z ENDPContext
StackExchange Code Review Q#135852, answer score: 2
Revisions (0)
No revisions yet.