patterncppMinor
Calculating sum of primes using the CPU and GPU
Viewed 0 times
theprimesgpucalculatingusingsumandcpu
Problem
This is a little baffling to me as to why the CUDA code runs about twice as slow as the CPU version. I am just counting all the primes from 0 to (512 512 512). The CPU version executed in about 97 seconds whereas the GPU version took 182 seconds.
Any ideas why?
CPU version
CUDA version
- CPU: Intel Core i7 @ 4 GHz
- GPU: NVIDIA GTX 960
Any ideas why?
#include
#include
#include
#include
#include
#include
#include
#include
using namespace std;
__host__ __device__ bool is_prime(uint32_t n)
{
if(n == 2)
return true;
if(n % 2 == 0)
return false;
uint32_t sr = sqrtf(n);
for(uint32_t i = 3; i <= sr; i += 2)
if(n % i == 0)
return false;
return true;
}
__global__ void prime_sum(unsigned int* count)
{
uint32_t n = (blockIdx.y * gridDim.y + blockIdx.x) * blockDim.x + threadIdx.x;
if(is_prime(n))
atomicAdd(count, 1);
}CPU version
int main()
{
time_t start = time(0);
int pcount = 0;
for(uint32_t i = 0; i < (512 * 512 * 512); i++)
{
if(is_prime(i)) pcount++;
}
start = time(0) - start;
std::cout << pcount << "\t" << start << std::endl;
return 0;
}CUDA version
int main()
{
time_t start = time(0);
unsigned int* sum_d;
cudaMalloc(&sum_d, sizeof(unsigned int));
cudaMemset(sum_d, 0, sizeof(unsigned int));
prime_sum>>(sum_d);
unsigned int sum = 0;
cudaMemcpy(&sum, sum_d, sizeof(unsigned int), cudaMemcpyDeviceToHost);
start = time(0) - start;
std::cout << sum << "\t" << start << std::endl;
cudaFree(sum_d);
return 0;
}Solution
-
Try not to use
-
This is lacking in curly braces and could be bad for maintenance:
It's a good idea to still use them for single-line statements, and you should especially use them with
Here's what it should look like:
-
Consider finding an alternative to
-
One source of slower performance may be the
-
This line seems confusing:
It appears that the lvalue should instead be named
Try not to use
using namespace std.-
This is lacking in curly braces and could be bad for maintenance:
if(n == 2)
return true;
if(n % 2 == 0)
return false;
uint32_t sr = sqrtf(n);
for(uint32_t i = 3; i <= sr; i += 2)
if(n % i == 0)
return false;
return true;It's a good idea to still use them for single-line statements, and you should especially use them with
for loops. Some extra whitespace in places could be useful as well.Here's what it should look like:
if (n == 2)
{
return true;
}
if (n % 2 == 0)
{
return false;
}
uint32_t sr = sqrtf(n);
for (uint32_t i = 3; i <= sr; i += 2)
{
if (n % i == 0)
{
return false;
}
}
return true;-
Consider finding an alternative to
sqrtf() as it can be slow. It may not make a huge difference in overall runtime, though. Fortunately, it appears to be thread-safe.-
One source of slower performance may be the
if statement in prime_sum(). Doing so may cause thread divergence, if threads end up executing different instructions. You'll have to analyze your code to determine this for sure.-
This line seems confusing:
start = time(0) - start;It appears that the lvalue should instead be named
end since you already calculate a starting time beforehand.Code Snippets
if(n == 2)
return true;
if(n % 2 == 0)
return false;
uint32_t sr = sqrtf(n);
for(uint32_t i = 3; i <= sr; i += 2)
if(n % i == 0)
return false;
return true;if (n == 2)
{
return true;
}
if (n % 2 == 0)
{
return false;
}
uint32_t sr = sqrtf(n);
for (uint32_t i = 3; i <= sr; i += 2)
{
if (n % i == 0)
{
return false;
}
}
return true;start = time(0) - start;Context
StackExchange Code Review Q#100382, answer score: 3
Revisions (0)
No revisions yet.