patterncMinor
Vectorizing the product of an array of complex numbers
Viewed 0 times
thearraynumbersproductvectorizingcomplex
Problem
I am trying to write fast/optimal code to vectorize the product of an array of complex numbers. In simple C this would be:
However, gcc can't vectorize this and my target CPU is the AMD FX-8350 which supports AVX. To speed it up I have tried:
Can this code be improved for AVX and is it as fast as it can get?
#include
complex float f(complex float x[], int n ) {
complex float p = 1.0;
for (int i = 0; i < n; i++)
p *= x[i];
return p;
}However, gcc can't vectorize this and my target CPU is the AMD FX-8350 which supports AVX. To speed it up I have tried:
typedef float v4sf __attribute__ ((vector_size (16)));
typedef union {
v4sf v;
float e[4];
} float4;
typedef struct {
float4 x;
float4 y;
} complex4;
static complex4 complex4_mul(complex4 a, complex4 b) {
return (complex4){a.x.v*b.x.v -a.y.v*b.y.v, a.y.v*b.x.v + a.x.v*b.y.v};
}
complex4 f4(complex4 x[], int n) {
v4sf one = {1,1,1,1};
complex4 p = {one,one};
for (int i = 0; i < n; i++) p = complex4_mul(p, x[i]);
return p;
}Can this code be improved for AVX and is it as fast as it can get?
Solution
I was wondering if the compiler would more easily be able to vectorize it without the
How about this?
Looking at the assembly on https://godbolt.org/ it seems it gets fully vectorized. I can't get the godbolt sharing links to work.
I'm thinking about whether it might be possible to stick both the real and imaginary parts in the same vector, and use
union-within-struct and the potential type punning of the e member (which doesn't seem to be used).How about this?
typedef float v4sf __attribute__ ((vector_size (16)));
typedef struct {
v4sf x;
v4sf y;
} complex4;
static inline v4sf complex4_mul_r(v4sf a_r, v4sf a_i, v4sf b_r, v4sf b_i) {
return a_r*b_r -a_i*b_i;
}
static inline v4sf complex4_mul_i(v4sf a_r, v4sf a_i, v4sf b_r, v4sf b_i) {
return a_r*b_i + a_i*b_r;
}
complex4 f4(v4sf x_r[], v4sf x_i[], int n) {
v4sf one = {1,1,1,1};
v4sf p_r = one;
v4sf p_i = one;
v4sf p_r_temp;
for (int i = 0; i < n; i++)
{
p_r_temp = complex4_mul_r(p_r, p_i, x_r[i], x_i[i]);
p_i = complex4_mul_i(p_r, p_i, x_r[i], x_i[i]);
p_r = p_r_temp;
}
return (complex4){p_r, p_i};
}Looking at the assembly on https://godbolt.org/ it seems it gets fully vectorized. I can't get the godbolt sharing links to work.
I'm thinking about whether it might be possible to stick both the real and imaginary parts in the same vector, and use
__builtin_shuffle() to permute them as necessary. Can't quite work it out.Code Snippets
typedef float v4sf __attribute__ ((vector_size (16)));
typedef struct {
v4sf x;
v4sf y;
} complex4;
static inline v4sf complex4_mul_r(v4sf a_r, v4sf a_i, v4sf b_r, v4sf b_i) {
return a_r*b_r -a_i*b_i;
}
static inline v4sf complex4_mul_i(v4sf a_r, v4sf a_i, v4sf b_r, v4sf b_i) {
return a_r*b_i + a_i*b_r;
}
complex4 f4(v4sf x_r[], v4sf x_i[], int n) {
v4sf one = {1,1,1,1};
v4sf p_r = one;
v4sf p_i = one;
v4sf p_r_temp;
for (int i = 0; i < n; i++)
{
p_r_temp = complex4_mul_r(p_r, p_i, x_r[i], x_i[i]);
p_i = complex4_mul_i(p_r, p_i, x_r[i], x_i[i]);
p_r = p_r_temp;
}
return (complex4){p_r, p_i};
}Context
StackExchange Code Review Q#152759, answer score: 2
Revisions (0)
No revisions yet.