好奇的探索者,理性的思考者,踏实的行动者。
Table of Contents:
OpenCL 和 CUDA 是两种并行计算平台,都可以用于利用 GPU 进行并行计算,但它们有一些区别:
开发者和生态系统:
厂商支持:
编程语言:
平台支持:
应用领域:
以下示例使用 CUDA 编写了一个向量加法程序,通过在 GPU 上并行计算两个向量的相加,从而加速了向量加法操作。
#include <stdio.h>

// CUDA kernel: element-wise addition of two int vectors, c[i] = a[i] + b[i].
// Expects a 1D grid of 1D blocks; each thread handles exactly one element.
// a, b: device pointers to the input vectors; c: device pointer to the output;
// n: number of elements.
__global__ void addVectors(int *a, int *b, int *c, int n) {
    // Flat global thread index across the whole grid.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Bounds guard: the grid tail may overshoot when n is not a
    // multiple of the block size.
    if (tid < n) {
        c[tid] = a[tid] + b[tid];
    }
}
// Host driver: initializes two vectors, adds them on the GPU with
// addVectors, and prints the per-element results.
int main() {
    const int n = 1000;            // const so the host arrays have a fixed size
    int a[n], b[n], c[n];
    int *dev_a, *dev_b, *dev_c;

    // Allocate device memory for the inputs and the output.
    cudaMalloc((void**)&dev_a, n * sizeof(int));
    cudaMalloc((void**)&dev_b, n * sizeof(int));
    cudaMalloc((void**)&dev_c, n * sizeof(int));

    // Initialize host data: a = 0..n-1, b = 2*a.
    for (int i = 0; i < n; ++i) {
        a[i] = i;
        b[i] = i * 2;
    }

    // Copy the input data from host memory to device memory.
    cudaMemcpy(dev_a, a, n * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, n * sizeof(int), cudaMemcpyHostToDevice);

    // Launch the kernel: ceil(n / 256) blocks of 256 threads.
    addVectors<<<(n + 255) / 256, 256>>>(dev_a, dev_b, dev_c, n);
    // A kernel launch returns no status directly; check for launch errors.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Copy the result back to the host. A blocking cudaMemcpy also
    // synchronizes with the preceding kernel on the default stream.
    cudaMemcpy(c, dev_c, n * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the results.
    for (int i = 0; i < n; ++i) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    // Release device memory.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
以下示例使用 OpenCL 编写了一个向量加法程序,同样通过在 GPU 上并行计算两个向量的相加,从而加速了向量加法操作。
#include <stdio.h>
#include <CL/cl.h>

#define N 1000

// Stringification helpers: the C preprocessor does NOT substitute macros
// inside string literals, so the OpenCL kernel source below would otherwise
// see an undefined identifier `N` and fail at clBuildProgram time.
#define STR_(x) #x
#define STR(x) STR_(x)

// OpenCL kernel source: element-wise addition c[i] = a[i] + b[i].
// The vector length N is baked into the source text via STR(N).
const char *kernelSource =
    "__kernel void addVectors(__global int *a, __global int *b, __global int *c) { \n"
    "    int tid = get_global_id(0);                                               \n"
    "    if (tid < " STR(N) ") {                                                   \n"
    "        c[tid] = a[tid] + b[tid];                                             \n"
    "    }                                                                         \n"
    "}                                                                             \n";
// Host driver: initializes two vectors, adds them on a GPU via the
// addVectors OpenCL kernel, and prints the per-element results.
// NOTE(review): error codes from the CL calls are ignored here for
// brevity, matching the example's tutorial style; production code
// should check every cl_int status.
int main() {
    int a[N], b[N], c[N];

    // Initialize host data: a = 0..N-1, b = 2*a.
    for (int i = 0; i < N; ++i) {
        a[i] = i;
        b[i] = i * 2;
    }

    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem bufferA, bufferB, bufferC;

    // Pick the first platform and the first GPU device on it,
    // then create a context and an in-order command queue.
    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
    queue = clCreateCommandQueue(context, device, 0, NULL);

    // Create device buffers and copy the inputs over.
    bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY, N * sizeof(int), NULL, NULL);
    bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY, N * sizeof(int), NULL, NULL);
    bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(int), NULL, NULL);
    // CL_TRUE = blocking write: the host arrays may be reused on return.
    clEnqueueWriteBuffer(queue, bufferA, CL_TRUE, 0, N * sizeof(int), a, 0, NULL, NULL);
    clEnqueueWriteBuffer(queue, bufferB, CL_TRUE, 0, N * sizeof(int), b, 0, NULL, NULL);

    // Build the program from source for the selected device.
    program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, NULL);
    clBuildProgram(program, 1, &device, NULL, NULL, NULL);

    // Create the kernel object.
    kernel = clCreateKernel(program, "addVectors", NULL);

    // Bind the three buffer arguments and launch one work-item per element.
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&bufferA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&bufferB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&bufferC);
    size_t globalWorkSize = N;
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalWorkSize, NULL, 0, NULL, NULL);

    // Blocking read of the result; this also waits for the kernel to finish
    // because the queue is in-order.
    clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, N * sizeof(int), c, 0, NULL, NULL);

    // Print the results.
    for (int i = 0; i < N; ++i) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    // Release all OpenCL resources.
    clReleaseMemObject(bufferA);
    clReleaseMemObject(bufferB);
    clReleaseMemObject(bufferC);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 0;
}