/Cudafy/CudafyByExample/chapter10/basic_double_stream_correct.cs
C# | 131 lines | 102 code | 17 blank | 12 comment | 6 complexity | b9b821dbafbe558f1c41695bb529044c MD5 | raw file
- /*
- * This software is based upon the book CUDA By Example by Sanders and Kandrot
- * and source code provided by NVIDIA Corporation.
- * It is a good idea to read the book while studying the examples!
- */
- using System;
- using System.Collections.Generic;
- using System.Linq;
- using System.Text;
- using Cudafy;
- using Cudafy.Host;
- using Cudafy.Translator;
-
- namespace CudafyByExample
- {
- public class basic_double_stream_correct
- {
- public const int N = (1024*1024);
- public const int FULL_DATA_SIZE = (N*20);
-
- [Cudafy]
- public static void thekernel(GThread thread, int[] a, int[] b, int[] c)
- {
- int idx = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
- if (idx < N)
- {
- int idx1 = (idx + 1) % 256;
- int idx2 = (idx + 2) % 256;
- float aS = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
- float bS = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
- c[idx] = (int)(aS + bS) / 2;
- }
- }
-
-
- public static void Execute()
- {
- CudafyModule km = CudafyTranslator.Cudafy();
-
- GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
- gpu.LoadModule(km);
-
- int[] dev_a0, dev_b0, dev_c0;
- int[] dev_a1, dev_b1, dev_c1;
-
- // allocate the memory on the GPU
- dev_a0 = gpu.Allocate<int>(N);
- dev_b0 = gpu.Allocate<int>(N);
- dev_c0 = gpu.Allocate<int>(N);
- dev_a1 = gpu.Allocate<int>(N);
- dev_b1 = gpu.Allocate<int>(N);
- dev_c1 = gpu.Allocate<int>(N);
-
- // allocate host locked memory, used to stream
- IntPtr host_aPtr = gpu.HostAllocate<int>(FULL_DATA_SIZE);
- IntPtr host_bPtr = gpu.HostAllocate<int>(FULL_DATA_SIZE);
- IntPtr host_cPtr = gpu.HostAllocate<int>(FULL_DATA_SIZE);
-
- Random rand = new Random();
- for (int i = 0; i < FULL_DATA_SIZE; i++)
- {
- host_aPtr.Set(i, rand.Next(1024 * 1024)); // There will be differences between the .NET code and the GPU
- host_bPtr.Set(i, rand.Next(1024 * 1024)); // So let's keep these to a minimum by having a max random values.
- }
-
- // start timer
- gpu.StartTimer();
-
- // now loop over full data, in bite-sized chunks
- for (int i = 0; i < FULL_DATA_SIZE; i += N * 2)
- {
- gpu.CopyToDeviceAsync(host_aPtr, i, dev_a0, 0, N, 1);
- gpu.CopyToDeviceAsync(host_bPtr, i, dev_b0, 0, N, 2);
- gpu.CopyToDeviceAsync(host_aPtr, i + N, dev_a1, 0, N, 1);
- gpu.CopyToDeviceAsync(host_bPtr, i + N, dev_b1, 0, N, 2);
- gpu.LaunchAsync(N / 256, 256, 1, "thekernel", dev_a0, dev_b0, dev_c0);
- gpu.LaunchAsync(N / 256, 256, 2, "thekernel", dev_a1, dev_b1, dev_c1);
- //gpu.Launch(N / 256, 256, 1).kernel(dev_a0, dev_b0, dev_c0);
- //gpu.Launch(N / 256, 256, 2).kernel(dev_a1, dev_b1, dev_c1);
- gpu.CopyFromDeviceAsync(dev_c0, 0, host_cPtr, i, N, 1);
- gpu.CopyFromDeviceAsync(dev_c1, 0, host_cPtr, i + N, N, 2);
- }
- gpu.SynchronizeStream(1);
- gpu.SynchronizeStream(2);
-
- float elapsed = gpu.StopTimer();
-
- // verify
- int[] host_a = new int[FULL_DATA_SIZE];
- int[] host_b = new int[FULL_DATA_SIZE];
- int[] host_c = new int[FULL_DATA_SIZE];
-
- GPGPU.CopyOnHost(host_aPtr, 0, host_a, 0, FULL_DATA_SIZE);
- GPGPU.CopyOnHost(host_bPtr, 0, host_b, 0, FULL_DATA_SIZE);
- GPGPU.CopyOnHost(host_cPtr, 0, host_c, 0, FULL_DATA_SIZE);
- Console.WriteLine("Elapsed: {0} ms", elapsed);
-
- int[] host_d = new int[FULL_DATA_SIZE];
- int errors = 0;
- int id = 0;
- {
- for (int j = 0; j < N; j++, id++)
- {
- control(id, j, host_a, host_b, host_d);
- if (host_c[id] > host_d[id] + 1) // There will be differences between the .NET code and the GPU
- {
- Console.WriteLine("Mismatch at {0}: {1} != {2}", id, host_c[id], host_d[id]);
- errors++;
- if (errors > 8)
- break;
- }
- }
- }
-
- gpu.HostFree(host_aPtr);
- gpu.HostFree(host_bPtr);
- gpu.HostFree(host_cPtr);
- gpu.DestroyStream(1);
- gpu.DestroyStream(2);
- }
-
- public static void control(int idx, int jdx, int[] a, int[] b, int[] c)
- {
- int idx1 = idx/N + (jdx + 1) % 256;
- int idx2 = idx/N + (jdx + 2) % 256;
- float aS = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
- float bS = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
- c[idx] = (int)(aS + bS) / 2;
- }
- }
- }