patterncppMinor

Atomic floating-point addition

Submitted by: @import:stackexchange-codereview·Mar 10, 2026·

Viewed 0 times

codereview cpp c++stackoverflow floating-point performance multithreading atomic

floatingadditionpointatomic

Problem

I need to be able to atomically add a 32-bit floating point value to a location in memory. Here is what I came up with. While the code is Windows-specific, I'll extend it with Linux support using __sync_bool_compare_and_swap().

While I tested this code and it appeared to work fine, I'll appreciate a fresh look from another pair of eyes to make sure this code is 100% safe.

I'm also very much interested in any performance advice as this code is used in a hot code path in a performance-sensitive application (interactive rendering).

atomic_float_add(), the main function:

__forceinline void atomic_float_add(volatile float* ptr, const float operand)
{
    assert(is_aligned(ptr, 4));

    volatile LONG* lptr = reinterpret_cast(ptr);
    LONG lorg, lnew;

    do
    {
        const float forg = *ptr;
        const float fnew = forg + operand;
        lorg = binary_cast(forg);
        lnew = binary_cast(fnew);
    } while (InterlockedCompareExchange(lptr, lnew, lorg) != lorg);
}

binary_cast() is implemented in such a way that it obeys strict aliasing rules:

template 
inline Target binary_cast(Source s)
{
    BOOST_STATIC_ASSERT(sizeof(Target) == sizeof(Source));

    union
    {
        Source  m_source;
        Target  m_target;
    } u;

    u.m_source = s;
    return u.m_target;
}

is_aligned():

template 
inline bool is_aligned(const T ptr, const size_t alignment)
{
    assert(alignment > 0);
    assert(is_pow2(alignment));

    const uintptr_t p = (uintptr_t)ptr;
    return (p & (alignment - 1)) == 0;
}

is_pow2():

template 
inline bool is_pow2(const T x)
{
    return (x & (x - 1)) == 0;
}

Assembly output for atomic_float_add() (Visual Studio 2013, full optimization):

```
lorg$ = 8
ptr$ = 8
lnew$ = 16
operand$ = 16
?atomic_float_add@?A0x1534477b@foundation@@YAXPECMM@Z PROC

prefetchw BYTE PTR [rcx]
npad 13

$LL3@atomic_flo:
movss xmm0, DWORD PTR [rcx]
movss DWORD PTR lorg$[rsp], xmm0
addss

Solution

@EOF commented that InterlockedCompareExchange (and its gcc counterpart, __sync_val_compare_and_swap) returns the initial value of the destination address. This allows to remove one memory load from the retry loop.

Here is the new version with this optimization:

__forceinline void atomic_float_add(volatile float* ptr, const float operand)
{
    assert(is_aligned(ptr, 4));

    volatile LONG* iptr = reinterpret_cast(ptr);
    LONG expected = *iptr;

    while (true)
    {
        const float value = binary_cast(expected);
        const LONG new_value = binary_cast(value + operand);
        const LONG actual = InterlockedCompareExchange(iptr, new_value, expected);
        if (actual == expected)
            return;
        expected = actual;
    }
}

Here is the corresponding assembly:

?atomic_float_add@foundation@@YAXPECMM@Z PROC

    mov eax, DWORD PTR [rcx]
    mov r8, rcx
    mov DWORD PTR value$2[rsp], eax
    movss   xmm0, DWORD PTR value$2[rsp]
    addss   xmm0, xmm1
    movss   DWORD PTR new_value$1[rsp], xmm0
    mov edx, DWORD PTR new_value$1[rsp]
    lock cmpxchg DWORD PTR [rcx], edx
    je  SHORT $LN16@atomic_float_add
    npad    13

$LL3@atomic_float_add:
    mov DWORD PTR value$2[rsp], eax
    mov edx, eax
    movss   xmm0, DWORD PTR value$2[rsp]
    addss   xmm0, xmm1
    movss   DWORD PTR new_value$1[rsp], xmm0
    mov ecx, DWORD PTR new_value$1[rsp]
    lock cmpxchg DWORD PTR [r8], ecx
    cmp eax, edx
    jne SHORT $LL3@atomic_float_add

$LN16@atomic_float_add:
    ret 0

?atomic_float_add@foundation@@YAXPECMM@Z ENDP

Interestingly, the compiler decided to do one iteration before entering the loop. I'm not sure I understand the benefit of this...

Code Snippets

__forceinline void atomic_float_add(volatile float* ptr, const float operand)
{
    assert(is_aligned(ptr, 4));

    volatile LONG* iptr = reinterpret_cast<volatile LONG*>(ptr);
    LONG expected = *iptr;

    while (true)
    {
        const float value = binary_cast<float>(expected);
        const LONG new_value = binary_cast<LONG>(value + operand);
        const LONG actual = InterlockedCompareExchange(iptr, new_value, expected);
        if (actual == expected)
            return;
        expected = actual;
    }
}

?atomic_float_add@foundation@@YAXPECMM@Z PROC

    mov eax, DWORD PTR [rcx]
    mov r8, rcx
    mov DWORD PTR value$2[rsp], eax
    movss   xmm0, DWORD PTR value$2[rsp]
    addss   xmm0, xmm1
    movss   DWORD PTR new_value$1[rsp], xmm0
    mov edx, DWORD PTR new_value$1[rsp]
    lock cmpxchg DWORD PTR [rcx], edx
    je  SHORT $LN16@atomic_float_add
    npad    13

$LL3@atomic_float_add:
    mov DWORD PTR value$2[rsp], eax
    mov edx, eax
    movss   xmm0, DWORD PTR value$2[rsp]
    addss   xmm0, xmm1
    movss   DWORD PTR new_value$1[rsp], xmm0
    mov ecx, DWORD PTR new_value$1[rsp]
    lock cmpxchg DWORD PTR [r8], ecx
    cmp eax, edx
    jne SHORT $LL3@atomic_float_add

$LN16@atomic_float_add:
    ret 0

?atomic_float_add@foundation@@YAXPECMM@Z ENDP

Context

StackExchange Code Review Q#135852, answer score: 2

Revisions (0)

No revisions yet.