Gate_neural_network_cpp/opencl_benchmark.cpp at main · eshwanthkartitr/Gate_neural_network_cpp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <iomanip>  // Add this for std::setprecision
#include <iostream>
#include <random>
#include <string>
#include <vector>

// OpenCL C source for the benchmark kernel `vector_add`, kept as a string so it
// can be handed to clCreateProgramWithSource at run time.
//
// Each work-item reads its global id in dimension 0 and, when that id is below
// `n`, writes result[id] = a[id] + b[id] + sin(a[id]) * cos(b[id]).
// The `id < n` guard makes work-items with an id at or beyond `n` do nothing.
// NOTE(review): `id` is a signed int while `n` is unsigned int, so the guard is
// a mixed-sign comparison; get_global_id() returns size_t in OpenCL C.
const char* kernelSource = R"(
__kernel void vector_add(__global const float* a,
                        __global const float* b,
                        __global float* result,
                        const unsigned int n) {
    int id = get_global_id(0);
    if (id < n) {
        result[id] = a[id] + b[id] + sin(a[id]) * cos(b[id]);
    }
}
)";

class OpenCLBenchmark {
private:
    cl_platform_id platform;
    cl_context context;
    cl_program program;
    cl_kernel kernel;

public:  // Move these to public section
    cl_device_id cpu_device, gpu_device;

    bool initialize() {
        cl_int err;

        // Get platform
        cl_uint platformCount;
        err = clGetPlatformIDs(1, &platform, &platformCount);
        if (err != CL_SUCCESS || platformCount == 0) {
            std::cout << "No OpenCL platforms found!" << std::endl;
            return false;
        }

        // Get devices
        cl_uint deviceCount;
        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
        if (err != CL_SUCCESS || deviceCount == 0) {
            std::cout << "No OpenCL devices found!" << std::endl;
            return false;
        }

        std::vector<cl_device_id> devices(deviceCount);
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, deviceCount, devices.data(), NULL);

        // Find CPU and GPU devices
        bool foundCPU = false, foundGPU = false;
        for (auto device : devices) {
            cl_device_type type;
            clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);

            if (type == CL_DEVICE_TYPE_CPU && !foundCPU) {
                cpu_device = device;
                foundCPU = true;
            } else if (type == CL_DEVICE_TYPE_GPU && !foundGPU) {
                gpu_device = device;
                foundGPU = true;
            }
        }

        if (!foundCPU) {
            std::cout << "No CPU OpenCL device found. This is expected with AMD-only drivers." << std::endl;
            std::cout << "Comparing: Pure CPU (Ryzen) vs GPU (Radeon) performance." << std::endl;
            cpu_device = gpu_device;
        }

        if (!foundGPU) {
            std::cout << "No GPU device found, using CPU for both tests" << std::endl;
            gpu_device = cpu_device;
        }

        return true;
    }

    double runBenchmark(cl_device_id device, const std::vector<float>& a,
                       const std::vector<float>& b, std::vector<float>& result) {

        cl_int err;
        size_t dataSize = a.size();

        // Create context for the specific device
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);

        // Create command queue (use OpenCL 1.2 compatible API)
        cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);

        // Create program
        program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
        clBuildProgram(program, 1, &device, NULL, NULL, NULL);

        // Create kernel
        kernel = clCreateKernel(program, "vector_add", &err);

        // Create buffers
        cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                    sizeof(float) * dataSize, (void*)a.data(), &err);
        cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                    sizeof(float) * dataSize, (void*)b.data(), &err);
        cl_mem bufResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                         sizeof(float) * dataSize, NULL, &err);

        // Set kernel arguments
        clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);
        clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);
        clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufResult);
        clSetKernelArg(kernel, 3, sizeof(unsigned int), &dataSize);

        // Execute kernel and measure time
        size_t globalSize = dataSize;

        auto start = std::chrono::high_resolution_clock::now();

        clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, NULL);
        clFinish(queue); // Wait for completion

        auto end = std::chrono::high_resolution_clock::now();

        // Read back results
        clEnqueueReadBuffer(queue, bufResult, CL_TRUE, 0,
                           sizeof(float) * dataSize, result.data(), 0, NULL, NULL);

        // Cleanup
        clReleaseMemObject(bufA);
        clReleaseMemObject(bufB);
        clReleaseMemObject(bufResult);
        clReleaseKernel(kernel);
        clReleaseProgram(program);
        clReleaseCommandQueue(queue);
        clReleaseContext(context);

        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
        return duration.count() / 1000.0; // Return milliseconds
    }

    void printDeviceInfo(cl_device_id device, const std::string& label) {
        char deviceName[256] = {0}; // Initialize buffer and make it larger
        cl_int err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);

        cl_uint computeUnits = 0;
        clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(computeUnits), &computeUnits, NULL);

        cl_ulong globalMemSize = 0;
        clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(globalMemSize), &globalMemSize, NULL);

        cl_uint maxFreq = 0;
        clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(maxFreq), &maxFreq, NULL);

        std::cout << "\n" << label << " Device Info:" << std::endl;
        std::cout << "  Name: " << (err == CL_SUCCESS ? deviceName : "Unknown Device") << std::endl;
        std::cout << "  Compute Units: " << computeUnits << std::endl;
        std::cout << "  Global Memory: " << globalMemSize / (1024*1024) << " MB" << std::endl;
        std::cout << "  Max Frequency: " << maxFreq << " MHz" << std::endl;
    }
};

// CPU reference implementation for comparison.
//
// Computes result[i] = a[i] + b[i] + sin(a[i]) * cos(b[i]) for the first
// min(a.size(), b.size()) elements, entirely in single precision so the work
// matches the float arithmetic of the OpenCL kernel.
//
// Parameters:
//   a, b   - input vectors.
//   result - receives the output; grown if it is too small, never shrunk.
//
// Returns the elapsed wall-clock time of the loop in milliseconds
// (microsecond resolution).
double cpuBenchmark(const std::vector<float>& a, const std::vector<float>& b,
                   std::vector<float>& result) {
    const size_t count = std::min(a.size(), b.size());
    if (result.size() < count) {
        result.resize(count);  // Done before timing so allocation is not measured.
    }

    // steady_clock is monotonic, which is what an interval measurement needs.
    const auto start = std::chrono::steady_clock::now();

    for (size_t i = 0; i < count; ++i) {
        // std:: qualified so the float overloads are selected; the unqualified
        // names may resolve to the C double versions.
        result[i] = a[i] + b[i] + std::sin(a[i]) * std::cos(b[i]);
    }

    const auto end = std::chrono::steady_clock::now();
    const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    return duration.count() / 1000.0; // Return milliseconds
}

// Entry point: for each problem size, fills two random input vectors, times
// the plain C++ loop and the OpenCL kernel on the CPU-slot and GPU devices,
// prints the timings and the GPU speedup, and checks the first (up to) 100
// GPU results against the CPU reference.
//
// Returns 0 on completion and -1 when OpenCL cannot be initialized.
int main() {
    std::cout << "=== OpenCL CPU vs GPU Benchmark ===" << std::endl;

    // Test with different data sizes
    const std::vector<size_t> sizes = {100000, 1000000, 5000000, 10000000};

    OpenCLBenchmark benchmark;
    if (!benchmark.initialize()) {
        std::cerr << "Failed to initialize OpenCL!" << std::endl;
        return -1;
    }

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> dis(-10.0f, 10.0f);

    // Prints one timing line; a negative time is treated as a failed run.
    const auto printTime = [](const char* label, double ms) {
        std::cout << label;
        if (ms < 0.0) {
            std::cout << "failed";
        } else {
            std::cout << std::fixed << std::setprecision(3) << ms << " ms";
        }
        std::cout << std::endl;
    };

    bool deviceInfoPrinted = false;

    for (size_t dataSize : sizes) {
        std::cout << "\n" << std::string(50, '=') << std::endl;
        std::cout << "Testing with " << dataSize << " elements" << std::endl;
        std::cout << std::string(50, '=') << std::endl;

        // Generate random data
        std::vector<float> a(dataSize), b(dataSize);
        std::vector<float> cpuResult(dataSize), gpuResult(dataSize), oclCpuResult(dataSize);

        for (size_t i = 0; i < dataSize; ++i) {
            a[i] = dis(gen);
            b[i] = dis(gen);
        }

        // Run CPU reference
        const double cpuTime = cpuBenchmark(a, b, cpuResult);

        // Run OpenCL on CPU
        const double oclCpuTime = benchmark.runBenchmark(benchmark.cpu_device, a, b, oclCpuResult);

        // Run OpenCL on GPU
        const double gpuTime = benchmark.runBenchmark(benchmark.gpu_device, a, b, gpuResult);

        // Print device info for first run
        if (!deviceInfoPrinted) {
            benchmark.printDeviceInfo(benchmark.cpu_device, "CPU");
            benchmark.printDeviceInfo(benchmark.gpu_device, "GPU");
            std::cout << std::endl;
            deviceInfoPrinted = true;
        }

        // Display results
        std::cout << "Performance Results:" << std::endl;
        printTime("  Pure CPU (Ryzen):  ", cpuTime);
        // cpu_device is whatever initialize() stored in that slot.
        printTime("  OpenCL CPU device: ", oclCpuTime);
        printTime("  GPU (Radeon):      ", gpuTime);

        std::cout << "\nSpeedup Analysis:" << std::endl;
        if (gpuTime > 0.0) {
            std::cout << "  GPU vs CPU:        " << std::fixed << std::setprecision(2)
                      << cpuTime / gpuTime << "x faster" << std::endl;
        } else {
            // A zero or invalid GPU time would make the ratio meaningless.
            std::cout << "  GPU vs CPU:        n/a" << std::endl;
        }

        // Verify results are correct (check first few elements). A failed GPU
        // run can never pass, and the comparison is written so that a NaN
        // difference counts as a mismatch.
        bool resultsMatch = gpuTime >= 0.0;
        const size_t checkCount = std::min(size_t(100), dataSize);
        for (size_t i = 0; resultsMatch && i < checkCount; ++i) {
            const float diff = std::abs(cpuResult[i] - gpuResult[i]);
            if (!(diff <= 1e-5f)) {
                resultsMatch = false;
            }
        }
        std::cout << "  Result Verification: " << (resultsMatch ? "PASSED" : "FAILED") << std::endl;
    }

    return 0;
}