Here are 4 code snippets that are roughly equivalent. All of them will run the following function calls, in some order, possibly in parallel:
foo(0) foo(1) ... foo(55554)
int main() { #pragma omp parallel for for (int i = 0; i < 55555; ++i) { foo(i); } }
int main() { #pragma omp parallel #pragma omp single { for (int i = 0; i < 55555; ++i) { #pragma omp task foo(i); } } }
// Snippet 3: CUDA with one block per iteration and one thread per block.
// blockIdx.x ranges over [0, 55555), so foo(i) runs once for each i.
// (1-thread blocks leave 31 of a warp's 32 lanes idle — shown here only
// to illustrate the mapping, not as a performance recommendation.)
__global__ void kernel() {
    int i = blockIdx.x;
    foo(i);
}

int main() {
    kernel<<<55555, 1>>>();
    // Block the host until the kernel has finished.
    cudaDeviceSynchronize();
}
// Snippet 4: CUDA with 556 blocks of 100 threads each.
// That launches 556 * 100 = 55600 threads; the bounds guard retires the
// 45 surplus threads whose flat index i falls at or beyond 55555, so
// foo(i) executes exactly once for each i in [0, 55555).
__global__ void kernel() {
    int i = 100 * blockIdx.x + threadIdx.x;
    if (i >= 55555) {
        return;  // tail thread with no work — exit without calling foo
    }
    foo(i);
}

int main() {
    kernel<<<556, 100>>>();
    // Block the host until the kernel has finished.
    cudaDeviceSynchronize();
}