Introduction to CUDA C
Introduction to CUDA C Introduction to CUDA C
Parallel Dot Product: main() #define N 512 int main( void ) { int *a, *b, *c; // copies of a, b, c int *dev_a, *dev_b, *dev_c; // device copies of a, b, c int size = N * sizeof( int ); // we need space for 512 integers // allocate device copies of a, b, c cudaMalloc( (void**)&dev_a, size ); cudaMalloc( (void**)&dev_b, size ); cudaMalloc( (void**)&dev_c, sizeof( int ) ); a = (int *)malloc( size ); b = (int *)malloc( size ); c = (int *)malloc( sizeof( int ) ); random_ints( a, N ); random_ints( b, N );
Parallel Dot Product: main() } // copy inputs to device cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice ); cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice ); // launch dot() kernel with 1 block and N threads dot<<< 1, N >>>( dev_a, dev_b, dev_c ); // copy device result back to host copy of c cudaMemcpy( c, dev_c, sizeof( int ) , cudaMemcpyDeviceToHost ); free( a ); free( b ); free( c ); cudaFree( dev_a ); cudaFree( dev_b ); cudaFree( dev_c ); return 0;
- Page 1 and 2: Introduction to CUDA C San Jose Con
- Page 3 and 4: What is CUDA? • CUDA Architecture
- Page 5 and 6: CUDA C Prerequisites • You (proba
- Page 7 and 8: Hello, World! int main( void ) { pr
- Page 9 and 10: Hello, World! with Device Code __gl
- Page 11 and 12: A More Complex Example • A simple
- Page 13 and 14: Memory Management • Host and devi
- Page 15 and 16: A More Complex Example: main() int
- Page 17 and 18: Parallel Programming in CUDA C •
- Page 19 and 20: Parallel Programming in CUDA C •
- Page 21 and 22: Parallel Addition: main() #define N
- Page 23 and 24: Review • Difference between “ho
- Page 25 and 26: Threads • Terminology: A block ca
- Page 27 and 28: Parallel Addition (Threads): main()
- Page 29 and 30: Indexing Arrays With Threads And Bl
- Page 31 and 32: Addition with Threads and Blocks
- Page 33 and 34: Parallel Addition (Blocks/Threads):
- Page 35 and 36: Dot Product • Unlike vector addit
- Page 37 and 38: Dot Product • But we need to shar
- Page 39 and 40: Parallel Dot Product: dot() • We
- Page 41 and 42: Faulty Dot Product Exposed! • Ste
- Page 43 and 44: Synchronization • We need threads
- Page 45: Parallel Dot Product: dot() __globa
- Page 49 and 50: Review (cont) • Using __shared__
- Page 51 and 52: Multiblock Dot Product: Algorithm
- Page 53 and 54: Multiblock Dot Product: dot() #defi
- Page 55 and 56: Global Memory Contention Block 0 su
- Page 57 and 58: Atomic Operations • Terminology:
- Page 59 and 60: Parallel Dot Product: main() #defin
- Page 61 and 62: Review • Race conditions — Beha
- Page 63: Questions • First my questions
Parallel Dot Product: main()<br />
}<br />
// copy inputs to device<br />
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );<br />
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );<br />
// launch dot() kernel with 1 block and N threads<br />
dot<<< 1, N >>>( dev_a, dev_b, dev_c );<br />
// copy device result back to host copy of c<br />
cudaMemcpy( c, dev_c, sizeof( int ) , cudaMemcpyDeviceToHost );<br />
free( a ); free( b ); free( c );<br />
cudaFree( dev_a );<br />
cudaFree( dev_b );<br />
cudaFree( dev_c );<br />
return 0;