// Slurm regression test40.8.prog.cu
#include <iostream>
#include <math.h>
#include <sys/time.h>
// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
	int index = threadIdx.x;
	int stride = blockDim.x;
	for (int i = index; i < n; i += stride)
		y[i] = x[i] + y[i];
}
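
/*
 * For reference (not part of the original test): this kernel is launched
 * with a single block, so the simple block-stride loop above covers all
 * N elements. With multiple blocks, the idiomatic CUDA pattern is a full
 * grid-stride loop, where each thread starts at its global index and
 * strides by the whole grid. A hypothetical multi-block variant (the
 * name add_grid is illustrative only):
 *
 *	__global__
 *	void add_grid(int n, float *x, float *y)
 *	{
 *		int index = blockIdx.x * blockDim.x + threadIdx.x;
 *		int stride = blockDim.x * gridDim.x;
 *		for (int i = index; i < n; i += stride)
 *			y[i] = x[i] + y[i];
 *	}
 */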

int main(void)
{
	int N = 1024 * 1024 * 16;
	int i;
	float *x, *y;
	float maxError = 0.0f;
	struct timeval tv1, tv2;
	long delta_t;	/* elapsed microseconds; long avoids int overflow */

	// Get start time
	gettimeofday(&tv1, NULL);

	// Allocate Unified Memory – accessible from CPU or GPU
	cudaError_t rc = cudaMallocManaged(&x, N * sizeof(float));
	if (rc != cudaSuccess) {
		std::cerr << "Couldn't allocate memory for x: " << cudaGetErrorString(rc) << std::endl;
		return 1;
	}
	rc = cudaMallocManaged(&y, N * sizeof(float));
	if (rc != cudaSuccess) {
		std::cerr << "Couldn't allocate memory for y: " << cudaGetErrorString(rc) << std::endl;
		return 1;
	}

	// initialize x and y arrays on the host
	for (i = 0; i < N; i++) {
		x[i] = 1.0f;
		y[i] = 2.0f;
	}

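	/*
	 * Optional tuning, not part of the original test: with managed
	 * memory, the first GPU touch of each page triggers a demand
	 * page fault. On devices that support it, the arrays could be
	 * prefetched to the GPU before the launch, e.g.:
	 *
	 *	int dev = 0;
	 *	cudaGetDevice(&dev);
	 *	cudaMemPrefetchAsync(x, N * sizeof(float), dev, 0);
	 *	cudaMemPrefetchAsync(y, N * sizeof(float), dev, 0);
	 */
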
	// Launch the kernel on the GPU with one block of 256 threads;
	// each thread strides through the N elements 256 at a time
	add<<<1, 256>>>(N, x, y);

	// Wait for GPU to finish before accessing on host
	cudaDeviceSynchronize();
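	/*
	 * Hardening sketch (an addition, not in the original test):
	 * kernel launch and execution errors surface asynchronously, so
	 * a stricter variant would capture the return value of the
	 * cudaDeviceSynchronize() call above:
	 *
	 *	cudaError_t sync_rc = cudaDeviceSynchronize();
	 *	if (sync_rc != cudaSuccess) {
	 *		std::cerr << "Kernel failed: "
	 *			  << cudaGetErrorString(sync_rc) << std::endl;
	 *		return 1;
	 *	}
	 */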

	// Check for errors (all values should be 3.0f)
	for (i = 0; i < N; i++)
		maxError = fmaxf(maxError, fabsf(y[i] - 3.0f));
	std::cout << "Max error: " << maxError << std::endl;

	// Free memory
	cudaFree(x);
	cudaFree(y);

	// Get end time
	gettimeofday(&tv2, NULL);
	delta_t = (tv2.tv_sec - tv1.tv_sec) * 1000000;
	delta_t += (tv2.tv_usec - tv1.tv_usec);
	std::cout << "Run Time (usec): " << delta_t << std::endl;

	return 0;
}
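
/*
 * Build/run sketch (an assumption; the Slurm test suite normally builds
 * and launches this program itself):
 *
 *	nvcc -o test40.8.prog test40.8.prog.cu
 *	srun -N1 --gres=gpu:1 ./test40.8.prog
 */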