patterncppMinor
Calculating neuron outputs and derivatives
Viewed 0 times
derivativescalculatingoutputsneuronand
Problem
This function runs very often.
cudaMemcpy is at the start and works very slowly. How can I change this function to avoid this? I already have inputs in device memory.void OpenNNL::calculateNeuronsOutputsAndDerivatives(double * inputs, double * deviceOutputs, double * deviceDerivatives)
{
// Host-side driver: copies `inputs` into a scratch device buffer, then runs a
// per-iteration sequence of kernel launches that write into deviceOutputs and
// deviceDerivatives. All CUDA runtime calls are wrapped in cudaCall(...).
// NOTE(review): this listing is damaged -- see the notes on the `for` line and
// on the calculateOutputsAndDerivatives launch below.

// Current input width; starts at _inputsCount and is overwritten at the end of
// every iteration with that iteration's _neuronsPerLayerCount[i].
int inputsCount = _inputsCount;
double * deviceTemp;
double * deviceInputs;
// Allocate a scratch buffer and fill it from `inputs` using a device-to-device
// copy (cudaMemcpyDeviceToDevice), i.e. `inputs` is treated as a device pointer.
cudaCall(cudaMalloc ( (void**)&deviceInputs, inputsCount*sizeof(double) ));
cudaCall(cudaMemcpy ( deviceInputs, inputs, inputsCount*sizeof(double), cudaMemcpyDeviceToDevice ));
// NOTE(review): the next line is truncated. Everything between `i` and `>>>`
// (loop bound, opening brace, the allocation of deviceTemp, the declaration of
// blocksCount, the first kernel's name and its launch configuration) is missing,
// presumably lost when text between `<` and `>` was stripped. Restore it from
// the original source before attempting to compile.
for(int i=0;i>>(deviceTemp, deviceInputs, _neuronsInputsWeights, _inputsInPreviousLayers[i], inputsCount, _neuronsPerLayerCount[i]);
// deviceInputs is freed and re-allocated on every iteration, sized to
// _neuronsPerLayerCount[i] doubles.
cudaCall(cudaFree(deviceInputs));
cudaCall(cudaMalloc((void**)&deviceInputs, _neuronsPerLayerCount[i]*sizeof(double)));
dim3 threadsSum = dim3(BLOCK_SIZE, 1);
// floor(n / threadsSum.x) + 1 blocks: this is one block more than ceil-division
// whenever n is an exact multiple of threadsSum.x.
// NOTE(review): blocksCount is assigned here but its declaration is not visible
// (presumably in the truncated part of the `for` line) -- confirm.
blocksCount = floor((double) _neuronsPerLayerCount[i] / threadsSum.x) + 1;
dim3 blocksSum = dim3(blocksCount, 1);
// NOTE(review): the launch configuration before `>>` is missing on the next line
// (presumably <<<blocksSum, threadsSum>>>, since those are computed just above
// and otherwise unused -- confirm against the original).
calculateOutputsAndDerivatives>>(deviceOutputs, deviceDerivatives, deviceInputs, deviceTemp, _neuronsBiases, inputsCount, _neuronsPerLayerCount[i], _neuronsInPreviousLayers[i]);
// The next iteration's input width is this iteration's neuron count.
inputsCount = _neuronsPerLayerCount[i];
// deviceTemp is released every iteration; its matching allocation is not
// visible in this listing (presumably in the truncated part of the `for` line).
cudaCall(cudaFree(deviceTemp));
}
// Release the scratch buffer left over from the last iteration.
cudaCall(cudaFree(deviceInputs));
}Solution
Try to minimize memory allocations.
Allocate memory for
And in
Allocate memory for
deviceTemp and deviceInputs only once (in the constructor, for example):cudaCall(cudaMalloc ( (void**)&deviceInputs, some_big_value * sizeof(double) ));
cudaCall(cudaMalloc((void**)&deviceTemp, some_big_value * sizeof(double)));And in
calculateNeuronsOutputsAndDerivatives, reallocate memory only if needed:if (cur_deviceInputs_size < inputsCount)
{
cudaCall(cudaFree(deviceInputs));
cudaCall(cudaMalloc ( (void**)&deviceInputs, inputsCount*sizeof(double) ));
cur_deviceInputs_size = inputsCount;
}Code Snippets
// Two separate fragments follow, fused together by the page extraction.
// Fragment 1: allocate both scratch buffers once, up front, with a generous
// capacity. `some_big_value` is a placeholder and is not defined in this snippet.
cudaCall(cudaMalloc ( (void**)&deviceInputs, some_big_value * sizeof(double) ));
// Fragment 2 starts at the `if` on the next line: grow-only reallocation. The
// buffer is freed and re-allocated only when its recorded capacity
// (cur_deviceInputs_size) is smaller than the required inputsCount.
// NOTE(review): only deviceInputs is handled here; deviceTemp would presumably
// need the same treatment with its own capacity variable -- confirm.
cudaCall(cudaMalloc((void**)&deviceTemp, some_big_value * sizeof(double)));if (cur_deviceInputs_size < inputsCount)
{
cudaCall(cudaFree(deviceInputs));
cudaCall(cudaMalloc ( (void**)&deviceInputs, inputsCount*sizeof(double) ));
// Record the new capacity so later calls with smaller or equal sizes skip the reallocation.
cur_deviceInputs_size = inputsCount;
}Context
StackExchange Code Review Q#18698, answer score: 5
Revisions (0)
No revisions yet.