Accelerators - Any non-CPU piece of hardware that can carry out work
Kernel - The name for a portion of code that is sent to an accelerator
Offloading - The term for sending work intended for the CPU to an accelerator
There are two major approaches to programming accelerators:
Programmatic - the accelerator code is written explicitly (the OpenCL approach)
Directive - ordinary code is annotated and the compiler generates the accelerator code (the OpenACC approach)
To address OpenCL's issues, OpenACC was built on three major design pillars:
Simplicity - OpenCL can be quite tricky to learn; it requires several steps just to get a kernel to run on an accelerator
Power - OpenCL has poor support for cryptography and machine learning
Portability - There was no dominant manufacturer, so supporting a variety of accelerators was important
A directive just gives the compiler general instructions; compiler developers then have much more freedom to decide exactly how best to implement it.
#pragma acc kernels
- Automatically distribute loops across accelerator threads
Example of expressing parallelism:
#pragma acc kernels
{
    for (int i = 0; i < N; i++) {
        x[i] = i;
        y[i] = i * i;
    }
    for (int i = 0; i < N; i++) {
        z[i] = x[i] * y[i];
    }
}
#pragma acc parallel
- Explicitly state that this portion of code should be parallelised. Works for loops or segments of code
Example
#pragma acc parallel
{
    for (int i = 0; i < N; i++) {
        x[i] = i;
        y[i] = i * i;
    }
}
#pragma acc loop
- Explicitly state that this portion of code should be workload shared in a similar way to OpenMP
Example
#pragma acc parallel
{
    // the loop directive shares the iterations out across the threads of the region
    #pragma acc loop
    for (int i = 0; i < N; i++) {
        x[i] = i;
        y[i] = i * i;
    }
}
#pragma acc data
- This sets up data persistence on the accelerator: the data stays resident regardless of threads being created and destroyed
#pragma acc data
{
    // data clauses (copy, copyout, create) are described below
    #pragma acc parallel
    {
        for (int i = 0; i < N; i++) {
            x[i] = i;
            y[i] = i * i;
        }
    }
}
copy
- Specifies which data must be copied to the accelerator at the start of the region and copied back to the host at the end
#pragma acc data copy(x, y)
{
    #pragma acc parallel
    {
        for (int i = 0; i < N; i++) {
            x[i] = i;
            y[i] = i * i;
        }
    }
}
copyout
- Tells the compiler the data should be copied back to the host at the end of the region; because it is not copied in, an uninitialised copy is simply created on the accelerator
create
- Creates a variable that exists only on the accelerator, with no copying in either direction; this is good for temporary variables (a combined sketch of these clauses follows below)
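A minimal sketch pulling the three clauses together (the function name, the arrays and N are illustrative assumptions rather than anything from the notes): x and y travel in both directions, z only comes back, and tmp never leaves the accelerator.
// Hypothetical helper combining copy, copyout and create
void combine(int N, float *x, float *y, float *z, float *tmp) {
    #pragma acc data copy(x[0:N], y[0:N]) copyout(z[0:N]) create(tmp[0:N])
    {
        #pragma acc parallel loop
        for (int i = 0; i < N; i++) {
            tmp[i] = x[i] * y[i];   // lives only on the accelerator
            z[i]   = tmp[i] + x[i]; // copied back to the host at the end of the region
        }
    }
}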
Computational intensity - The proportion of mathematical operations to unique memory operations.
Maths is quite fast: a floating-point multiplication takes only a few clock cycles. Memory is quite slow: loading from RAM takes on the order of a hundred cycles. The table below assumes roughly 3 cycles per maths operation and 100 cycles per memory access; a short worked sketch follows it.
Operation | Maths ops : unique memory ops | Clock cycles (maths : memory)
---|---|---
a[i] = b[i] + c[i] | 1 : 3 | 3 : 300
a[i] = b[i] + c[i] * d[i] | 2 : 4 | 6 : 400
a[i]++ | 1 : 2 | 3 : 200
a[i] += a[i] * a[i] | 2 : 1 | 6 : 100
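As a worked sketch of the first row (the function name and types are purely illustrative): the single addition costs about 3 cycles while the three memory accesses cost about 300, so a loop like this is bound by memory rather than arithmetic.
// Low computational intensity: 1 maths operation per 3 unique memory accesses
void add_arrays(int n, const double *b, const double *c, double *a) {
    for (int i = 0; i < n; i++) {
        // ~3 cycles of arithmetic versus ~300 cycles of memory traffic per iteration
        a[i] = b[i] + c[i];
    }
}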
Work item - A thread associated with a particular element of the computation
Global dimensions - How many dimensions the data has
Local dimensions - A smaller, workable chunk of the global dimensions (see the sketch below)
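In CUDA terms (used in the rest of these notes) the global dimensions correspond roughly to the overall grid of threads and the local dimensions to the block. A minimal sketch for a 2D problem; the 1024x1024 image, the 16x16 block shape, the kernel and the device_pixels pointer are all assumptions made purely for illustration.
// Hypothetical kernel: one thread (work item) per pixel of a 1024x1024 image
__global__ void invert(unsigned char *pixels, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height)
        pixels[y * width + x] = 255 - pixels[y * width + x];
}

// device_pixels is assumed to be an already-allocated device array
void invert_image(unsigned char *device_pixels) {
    dim3 local(16, 16);                               // local dimensions: a 16x16 chunk of threads per block
    dim3 global((1024 + 15) / 16, (1024 + 15) / 16);  // global dimensions: enough blocks to cover the image
    invert<<<global, local>>>(device_pixels, 1024, 1024);
}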
The power of GPUs is growing at a much faster rate than CPUs
The speed of improvement is impressive; it is likely that languages will start adding dedicated support for GPUs by default.
There are two major factors stopping widespread adoption
Field Programmable Gate Arrays (FPGA)
Configurable Logic Blocks (CLB)
AI Accelerator (AIA)
Cryptographic Accelerator (CA)
CUDA is likely to be the language you will program in if you go into heavy HPC work. So why might somebody choose it over OpenCL?
Architecture
Memory Model
Programming Model
Kernel - A program written for the GPU. We write the program in CUDA C, and it gives the instructions for one thread; we then run many threads on the GPU at the same time.
Grid - The grid of threads represents all of the threads launched on the GPU.
We access memory in CUDA C just as we would a C array.
When we access shared memory, we can access b distinct banks in parallel.
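A minimal sketch of why that matters (the 64-element array and single 64-thread block are assumptions for illustration): consecutive threads touch consecutive words of shared memory, which fall in distinct banks and so can be serviced in parallel.
// Hypothetical kernel, launched as reverse<<<1, 64>>>(d) on a 64-int device array
__global__ void reverse(int *d) {
    __shared__ int s[64];   // shared memory, spread across the banks
    int t = threadIdx.x;
    s[t] = d[t];            // consecutive threads write to consecutive banks
    __syncthreads();        // wait until every element has been staged
    d[t] = s[63 - t];       // the reads are likewise spread across the banks
}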
Example of a CUDA kernel (vector addition):
// CUDA kernel device code
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        C[i] = A[i] + B[i] + 0.0f;
    }
}
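The host-side launch is not shown in the notes; as a minimal sketch (the 256-thread block size and the device pointers d_A, d_B and d_C are assumptions), the grid size is normally rounded up so the if (i < numElements) guard in the kernel masks off the spare threads in the final block.
// Hypothetical host-side helper choosing a launch configuration for vectorAdd
void launchVectorAdd(const float *d_A, const float *d_B, float *d_C, int numElements) {
    int threadsPerBlock = 256;
    // Round up so a final, partially-filled block still covers the leftover elements
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
}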
CUDA Recipe - The name given to the standard sequence of steps for getting code like this running on a GPU
Convert the code into a serial kernel
#include <stdlib.h>

int *calculate(int *a, int *b, int *c) {
    for (int i = 0; i < 40; i++) {
        c[i] = a[i] + b[i];
    }
    return c;
}

int main() {
    int i;
    int *a = (int *)malloc(40 * sizeof(int));
    int *b = (int *)malloc(40 * sizeof(int));
    int *c = (int *)malloc(40 * sizeof(int));
    // Read a file to assign a and b values
    c = calculate(a, b, c);
    free(a);
    free(b);
    free(c);
}
Make it a CUDA kernel by adding __global__ (a CUDA kernel must return void, so the result now comes back through the c pointer)
__global__
void calculate(int *a, int *b, int *c) {
    // __global__ kernels must return void, so the result comes back through c
    for (int i = 0; i < 40; i++) {
        c[i] = a[i] + b[i];
    }
}

int main() {
    int i;
    int *a = (int *)malloc(40 * sizeof(int));
    int *b = (int *)malloc(40 * sizeof(int));
    int *c = (int *)malloc(40 * sizeof(int));
    // Read a file to assign a and b values
    calculate(a, b, c); // the launch configuration is added in a later step
    free(a);
    free(b);
    free(c);
}
Replace loops in the CUDA kernel (thread id indexing)
__global__
void calculate(int *a, int *b, int *c) {
    // Each thread handles the single element given by its global thread id,
    // so the loop over i is no longer needed
    int thread_id = (blockIdx.x * blockDim.x) + threadIdx.x;
    c[thread_id] = a[thread_id] + b[thread_id];
}

int main() {
    int i;
    int *a = (int *)malloc(40 * sizeof(int));
    int *b = (int *)malloc(40 * sizeof(int));
    int *c = (int *)malloc(40 * sizeof(int));
    // Read a file to assign a and b values
    calculate(a, b, c); // the launch configuration is added in a later step
    free(a);
    free(b);
    free(c);
}
Handle problem sizes smaller than the thread number
__global__
void calculate(int *a, int *b, int *c) {
    int thread_id = (blockIdx.x * blockDim.x) + threadIdx.x;
    // Guard against threads beyond the 40 elements we actually have
    if (thread_id < 40)
        c[thread_id] = a[thread_id] + b[thread_id];
}
Allocate device memory
__global__
void calculate(int *a, int *b, int *c) {
    int thread_id = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (thread_id < 40)
        c[thread_id] = a[thread_id] + b[thread_id];
}

int main() {
    int i;
    int *device_a, *device_b, *device_c;
    int *a = (int *)malloc(40 * sizeof(int));
    int *b = (int *)malloc(40 * sizeof(int));
    int *c = (int *)malloc(40 * sizeof(int));
    // set up device memory
    cudaMalloc(&device_a, 40 * sizeof(int));
    cudaMalloc(&device_b, 40 * sizeof(int));
    cudaMalloc(&device_c, 40 * sizeof(int));
    // Read a file to assign a and b values
    calculate(a, b, c); // the launch configuration is added in a later step
    // release device memory
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);
    free(a);
    free(b);
    free(c);
}
Copy data from host to device
__global__
void calculate(int *a, int *b, int *c) {
    int thread_id = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (thread_id < 40)
        c[thread_id] = a[thread_id] + b[thread_id];
}

int main() {
    int i;
    int *device_a, *device_b, *device_c;
    int *a = (int *)malloc(40 * sizeof(int));
    int *b = (int *)malloc(40 * sizeof(int));
    int *c = (int *)malloc(40 * sizeof(int));
    // set up device memory
    cudaMalloc(&device_a, 40 * sizeof(int));
    cudaMalloc(&device_b, 40 * sizeof(int));
    cudaMalloc(&device_c, 40 * sizeof(int));
    // Read a file to assign a and b values
    // copy to device
    cudaMemcpy(device_a, a, 40 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(device_b, b, 40 * sizeof(int), cudaMemcpyHostToDevice);
    calculate(a, b, c); // the launch configuration is added in the next step
    // release device memory
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);
    free(a);
    free(b);
    free(c);
}
Call the CUDA kernel
__global__
void calculate(int *a, int *b, int *c) {
    int thread_id = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (thread_id < 40)
        c[thread_id] = a[thread_id] + b[thread_id];
}

int main() {
    int i;
    int *device_a, *device_b, *device_c;
    int *a = (int *)malloc(40 * sizeof(int));
    int *b = (int *)malloc(40 * sizeof(int));
    int *c = (int *)malloc(40 * sizeof(int));
    // set up device memory
    cudaMalloc(&device_a, 40 * sizeof(int));
    cudaMalloc(&device_b, 40 * sizeof(int));
    cudaMalloc(&device_c, 40 * sizeof(int));
    // Read a file to assign a and b values
    // copy to device
    cudaMemcpy(device_a, a, 40 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(device_b, b, 40 * sizeof(int), cudaMemcpyHostToDevice);
    // launch 5 blocks of 8 threads (40 threads in total) on the device copies
    calculate<<<5, 8>>>(device_a, device_b, device_c);
    // release device memory
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);
    free(a);
    free(b);
    free(c);
}
Copy data from device to host
int main() {
    int i;
    int *device_a, *device_b, *device_c;
    int *a = (int *)malloc(40 * sizeof(int));
    int *b = (int *)malloc(40 * sizeof(int));
    int *c = (int *)malloc(40 * sizeof(int));
    // set up device memory
    cudaMalloc(&device_a, 40 * sizeof(int));
    cudaMalloc(&device_b, 40 * sizeof(int));
    cudaMalloc(&device_c, 40 * sizeof(int));
    // Read a file to assign a and b values
    // copy to device
    cudaMemcpy(device_a, a, 40 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(device_b, b, 40 * sizeof(int), cudaMemcpyHostToDevice);
    // launch 5 blocks of 8 threads (40 threads in total) on the device copies
    calculate<<<5, 8>>>(device_a, device_b, device_c);
    // copy results back
    cudaMemcpy(c, device_c, 40 * sizeof(int), cudaMemcpyDeviceToHost);
    // release device memory
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);
    free(a);
    free(b);
    free(c);
}