Parallelism
Table of Contents
- 1. Dependence
- 2. Docker
- 3. Singularity
- 4. C/C++ with OpenMP
- 4.1. src compile config
- 4.2. useful library functions
- 4.3. Constructor
- 4.4. Subconstructor
- 4.5. hello world
- 4.6. num_threads
- 4.7. for
- 4.8. ordered
- 4.9. parallel for
- 4.10. parallel while
- 4.11. sections and section
- 4.12. master
- 4.13. private
- 4.14. firstprivate
- 4.15. lastprivate
- 4.16. shared
- 4.17. reduction
- 4.18. default(shared)
- 4.19. schedule(static)
- 4.20. schedule(dynamic)
- 4.21. schedule(guided)
- 5. C/C++ with mpi
- 6. Python
- 7. Python with mpi
- 8. Go
- 9. Go with mpi
- 9.1. cpuNum
- 9.2. IsOn
- 9.3. WorldRank
- 9.4. WorldSize
- 9.5. NewCommunicator
- 9.6. SendInt32 && RecvInt32
- 9.7. SendInt32s && RecvInt32s
- 9.8. SendString && RecvString
- 9.9. ReduceInt32s
- 9.10. AllreduceInt64s
- 9.11. BcastInt64s
- 9.12. distributed-client
- 9.13. Distributed learning server client
- 9.14. Distributed learning server client one to one
- 10. GPGPU
1. Dependence
sudo apt install openmpi-bin && sudo apt install libopenmpi-dev
2. Docker
2.0.1. old note
2.0.2. commands
- docker pull NAME
- docker images
- docker images -a
- docker run NAME
- docker run NAME:VERSION
- docker run -d NAME
- docker run -p6000:6379 -d redis
- docker run -p6000:6379 -d --name redis-older redis
- docker exec -it ID /bin/bash
- docker exec -it NAME /bin/bash
- docker ps
- docker ps -a
- docker start ID
- docker stop ID
- docker logs ID
2.0.3. description
fast delivery |
easy deploy and scale |
run more workloads |
easy management |
lightweight image |
No guest OS |
- Disadvantage: cannot change the kernel
- Less secure than VM
- Docker Registry hold Docker Images
- Docker Images create Docker Container(run as instance)
- Docker Container runs applications
- the shared layers are read-only, and containers communicate through sockets or a RESTful API
2.0.4. delete
- delete docker
docker ps -a // get dockerid or dockername; docker stop Did/Dname; docker rm Did/Dname
- delete image: docker images // get image id or image name; docker rmi Iid/Iname; untagged (<none>) images may also need to be removed
- sudo docker rm $(sudo docker ps -aq)
3. Singularity
3.1. build [sandbox]
build target source
source can be singularityfile, dockerhubname, online resource, sandbox simg file, singularity sif file
sudo singularity build singularityfile.sif singularityfile
sudo singularity build singularity.sif docker://dockerhubname
sudo singularity build --sandbox singularityfile.simg singularityfile
sudo singularity build --sandbox singularity.simg docker://dockerhubname
sudo singularity shell singularityfile.simg
sudo singularity shell --writable singularityfile.simg
sudo singularity shell --bind /home/silin/go:/home/go -w --nv centos-7.simg
4. C/C++ with OpenMP
4.1. src compile config
C-h v : org-babel-C-compiler change gcc to gcc -fopenmp
4.2. useful library functions
4.2.1. omp_get_thread_num
int main(int argc, char* argv[]) { #pragma omp parallel { printf("omp_get_thread_num: %d\n", omp_get_thread_num()); } }
omp_get_thread_num: 0 omp_get_thread_num: 5 omp_get_thread_num: 3 omp_get_thread_num: 2 omp_get_thread_num: 7 omp_get_thread_num: 6 omp_get_thread_num: 4 omp_get_thread_num: 1
4.2.2. omp_get_num_procs
int main(int argc, char* argv[]) { printf("omp_get_num_procs: %d\n", omp_get_num_procs()); return 0; }
omp_get_num_procs: 8
4.2.3. omp_get_num_threads
int main(int argc, char* argv[]) { #pragma omp parallel printf("omp_get_num_threads: %d\n", omp_get_num_threads()); #pragma omp parallel #pragma omp master { printf("omp_get_num_threads: %d\n", omp_get_num_threads()); } return 0; }
omp_get_num_threads: 8 omp_get_num_threads: 8 omp_get_num_threads: 8 omp_get_num_threads: 8 omp_get_num_threads: 8 omp_get_num_threads: 8 omp_get_num_threads: 8 omp_get_num_threads: 8 omp_get_num_threads: 8
4.2.4. omp_set_num_threads
int main() { omp_set_num_threads(4); printf("%d\n", omp_get_num_threads( )); #pragma omp parallel #pragma omp master { printf("%d\n", omp_get_num_threads( )); } }
1 4
4.2.5. omp_get_wtime
4.2.6. omp_get_wtick
4.2.7. omp_init_lock
4.2.8. omp_set_lock
4.2.9. omp_unset_lock
4.2.10. omp_test_lock
4.2.11. omp_destroy_lock
4.3. Constructor
parallel |
for |
parallel for |
sections |
parallel sections |
critical |
single |
barrier |
atomic |
master |
ordered |
threadprivate |
4.4. Subconstructor
private |
firstprivate |
lastprivate |
reduction |
nowait |
num_threads |
schedule |
shared |
ordered |
copyprivate |
copyin |
default |
4.5. hello world
int main(int argc, char* argv[]) { #pragma omp parallel { printf("Hello world from thread %d\n", omp_get_thread_num()); } }
Hello world from thread 0 Hello world from thread 1 Hello world from thread 4 Hello world from thread 3 Hello world from thread 5 Hello world from thread 7 Hello world from thread 6 Hello world from thread 2
4.6. num_threads
int main(int argc, char* argv[]) { #pragma omp parallel num_threads(6) { printf("omp_get_num_threads: %d\n", omp_get_num_threads()); } return 0; }
omp_get_num_threads: 6 omp_get_num_threads: 6 omp_get_num_threads: 6 omp_get_num_threads: 6 omp_get_num_threads: 6 omp_get_num_threads: 6
4.7. for
Without an enclosing parallel region, `omp for` runs on only one thread.
int main(int argc, char* argv[]) { int j = 0; #pragma omp for for (j = 0; j < 8; j++){ printf("j = %d, ThreadID = %d\n", j, omp_get_thread_num()); } return 0; }
j = 0, ThreadID = 0 j = 1, ThreadID = 0 j = 2, ThreadID = 0 j = 3, ThreadID = 0 j = 4, ThreadID = 0 j = 5, ThreadID = 0 j = 6, ThreadID = 0 j = 7, ThreadID = 0
4.8. ordered
int main(int argc, char* argv[]) { int j = 0; #pragma omp parallel for ordered for (j = 0; j < 2; j++){ #pragma omp ordered { printf("01 ordered j = %d, ThreadID = %d\n", j, omp_get_thread_num()); } #pragma omp ordered { printf("02 ordered j = %d, ThreadID = %d\n", j, omp_get_thread_num()); } #pragma omp ordered { printf("03 ordered j = %d, ThreadID = %d\n", j, omp_get_thread_num()); } } return 0; }
01 ordered j = 0, ThreadID = 0 02 ordered j = 0, ThreadID = 0 03 ordered j = 0, ThreadID = 0 01 ordered j = 1, ThreadID = 1 02 ordered j = 1, ThreadID = 1 03 ordered j = 1, ThreadID = 1
4.9. parallel for
`for` must be combined with `parallel` to distribute the iterations across threads.
int main(int argc, char* argv[]) { int j = 0; #pragma omp parallel for for (j = 0; j < 8; j++){ printf("j = %d, ThreadID = %d\n", j, omp_get_thread_num()); } return 0; }
j = 0, ThreadID = 0 j = 1, ThreadID = 1 j = 2, ThreadID = 2 j = 6, ThreadID = 6 j = 5, ThreadID = 5 j = 7, ThreadID = 7 j = 3, ThreadID = 3 j = 4, ThreadID = 4
int main(int argc, char* argv[]) { int j = 0; #pragma omp parallel for for (j = 0; j < 8; j++){ printf("j = %d, ThreadID = %d\n", j, omp_get_thread_num()); } return 0; }
j = 7, ThreadID = 7 j = 3, ThreadID = 3 j = 6, ThreadID = 6 j = 2, ThreadID = 2 j = 4, ThreadID = 4 j = 0, ThreadID = 0 j = 1, ThreadID = 1 j = 5, ThreadID = 5
This can be rewritten as follows, with `for` placed inside a `parallel` region; one region may contain multiple `for` blocks.
// Work-sharing loop nested inside an explicit parallel region.
// Build with: gcc -fopenmp (uses printf and omp_get_thread_num).
int main(int argc, char* argv[])
{
    // Team of 4 threads for this region.
    // #pragma omp parallel
    // EQUAL TO #pragma omp parallel num_threads(8)
    #pragma omp parallel num_threads(4)
    {
        // The 8 iterations are split among the threads of the enclosing team.
        #pragma omp for
        for (int j = 0; j < 8; j++)
        {
            printf("j = %d, ThreadID = %d\n", j, omp_get_thread_num());
        }
    }

    return 0;
}
j = 0, ThreadID = 0 j = 1, ThreadID = 0 j = 6, ThreadID = 3 j = 7, ThreadID = 3 j = 2, ThreadID = 1 j = 3, ThreadID = 1 j = 4, ThreadID = 2 j = 5, ThreadID = 2
4.10. parallel while
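Unlike `for`, a `while` loop has no work-sharing construct: every thread of the parallel region runs the whole loop on the shared counter, so iterations are duplicated.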
// Every thread of the parallel region executes this while loop itself.
// Build with: gcc -fopenmp (uses printf and omp_get_thread_num).
int main(int argc, char* argv[])
{
    int j = 0;

    // j is shared and incremented by all 8 threads without synchronization,
    // so the updates race and the printed values are not deterministic.
    #pragma omp parallel num_threads(8) shared(j)
    while (j < 8)
    {
        printf("j = %d, ThreadID = %d\n", j, omp_get_thread_num());
        ++j;
    }

    return 0;
}
j = 0, ThreadID = 0 j = 0, ThreadID = 7 j = 0, ThreadID = 3 j = 2, ThreadID = 3 j = 3, ThreadID = 3 j = 4, ThreadID = 3 j = 5, ThreadID = 3 j = 6, ThreadID = 3 j = 7, ThreadID = 3 j = 0, ThreadID = 6 j = 0, ThreadID = 4 j = 0, ThreadID = 1 j = 0, ThreadID = 5 j = 1, ThreadID = 0 j = 0, ThreadID = 2
4.11. sections and section
// Runs four independent sections; each prints the id of the thread that executed it.
// Build with: gcc -fopenmp (uses printf and omp_get_thread_num).
int main(int argc, char *argv[])
{
    // Combined construct: create a team and hand each section to one of its threads.
    #pragma omp parallel sections
    {
        #pragma omp section
        printf("section 1 ThreadId = %d\n", omp_get_thread_num());
        #pragma omp section
        printf("section 2 ThreadId = %d\n", omp_get_thread_num());
        #pragma omp section
        printf("section 3 ThreadId = %d\n", omp_get_thread_num());
        #pragma omp section
        printf("section 4 ThreadId = %d\n", omp_get_thread_num());
    }

    return 0;
}
section 4 ThreadId = 0 section 3 ThreadId = 5 section 2 ThreadId = 3 section 1 ThreadId = 6
Another form: `parallel` and `sections` can also be written separately, with multiple `sections` blocks in one region. The `sections` blocks do not run in parallel with each other; they are executed one after another.
// One parallel region containing two consecutive sections blocks.
// Build with: gcc -fopenmp (uses printf and omp_get_thread_num).
int main(int argc, char *argv[])
{
    #pragma omp parallel
    {
        // First sections block.
        #pragma omp sections
        {
            #pragma omp section
            printf("section 1 ThreadId = %d\n", omp_get_thread_num());
            #pragma omp section
            printf("section 2 ThreadId = %d\n", omp_get_thread_num());
            #pragma omp section
            printf("section 3 ThreadId = %d\n", omp_get_thread_num());
        }

        // Second sections block, reached after the first one.
        #pragma omp sections
        {
            #pragma omp section
            printf("section 1 ThreadId = %d\n", omp_get_thread_num());
            #pragma omp section
            printf("section 2 ThreadId = %d\n", omp_get_thread_num());
            #pragma omp section
            printf("section 3 ThreadId = %d\n", omp_get_thread_num());
        }
    }

    return 0;
}
section 1 ThreadId = 3 section 2 ThreadId = 0 section 3 ThreadId = 4 section 2 ThreadId = 0 section 1 ThreadId = 1 section 3 ThreadId = 6
4.12. master
// Inside a parallel region, only the master thread executes the master block.
// Build with: gcc -fopenmp (uses printf).
int main(int argc, char *argv[])
{
    #pragma omp parallel
    #pragma omp master
    {
        printf("I am the master\n");
    }

    return 0;
}
I am the master
4.13. private
After the `private` for-block, code outside the parallel loop behaves in a way that looks weird at first: the variable still holds its original value.
// Demonstrates private(k): the loop works on per-thread copies of k.
// Build with: gcc -fopenmp (uses printf).
int main(int argc, char *argv[])
{
    int k = 100;

    // Each thread gets its own k for the loop.
    #pragma omp parallel for private(k)
    for (k = 0; k < 8; k++)
    {
        printf("k=%d\n", k);
    }

    // Read the original k again after the loop.
    printf("kkk = %d\n", k);

    // Read it once more from inside a new parallel region.
    #pragma omp parallel sections
    {
        #pragma omp section
        printf("kkk = %d\n", k);
    }

    return 0;
}
k=6 k=7 k=3 k=2 k=0 k=4 k=5 k=1 kkk = 100 kkk = 100
4.14. firstprivate
Takes the variable's value into the parallel region, but changes are not written back.
// Demonstrates firstprivate(k): each thread's copy of k starts from the original value.
// Build with: gcc -fopenmp (uses printf).
int main(int argc, char *argv[])
{
    int k = 100;

    // Every thread starts with its own k initialized to 100.
    #pragma omp parallel for firstprivate(k)
    for (int i = 0; i < 8; i++)
    {
        k += i;
        printf("k=%d\n", k);
    }

    // The per-thread copies are discarded; the original k is printed here.
    printf("k out of parall = %d\n", k);

    return 0;
}
k=103 k=107 k=100 k=104 k=106 k=102 k=105 k=101 k out of parall = 100
4.15. lastprivate
Takes the variable's value into the parallel region, and the value from the last iteration is written back.
// Demonstrates firstprivate + lastprivate on the same variable.
// Build with: gcc -fopenmp (uses printf).
int main(int argc, char *argv[])
{
    int k = 100;

    // firstprivate: each thread's k starts at 100.
    // lastprivate: the k of the sequentially last iteration (i == 7) is copied back.
    #pragma omp parallel for firstprivate(k) lastprivate(k)
    for (int i = 0; i < 8; i++)
    {
        k += i;
        printf("k=%d\n", k);
    }

    printf("k out of parall = %d\n", k);

    return 0;
}
k=100 k=107 k=106 k=102 k=101 k=105 k=103 k=104 k out of parall = 107
4.16. shared
// Demonstrates shared(k): all threads read the same k.
// Build with: gcc -fopenmp (uses printf).
int main(int argc, char *argv[])
{
    int k = 100;
    int i = 1;

    // k is only read inside the loop, so sharing it needs no synchronization.
    #pragma omp parallel for shared(k)
    for (i = 0; i < 8; i++)
    {
        printf("k=%d\n", k);
    }

    return 0;
}
k=100 k=100 k=100 k=100 k=100 k=100 k=100 k=100 k = 100
4.17. reduction
// Sums 0..9 across threads with a reduction clause.
// Build with: gcc -fopenmp (uses printf and omp_get_thread_num).
int main(int argc, char *argv[])
{
    int sum = 0;
    int i = 0;

    // Each thread accumulates into a private sum; the partial sums are
    // combined with + into the original variable at the end of the loop.
    #pragma omp parallel for reduction(+:sum)
    for (i = 0; i < 10; i++)
    {
        printf("omp_get_thread_num: %d\n", omp_get_thread_num());
        sum += i;
    }

    printf("sum is %d\n", sum);

    return 0;
}
omp_get_thread_num: 0 omp_get_thread_num: 0 omp_get_thread_num: 1 omp_get_thread_num: 1 omp_get_thread_num: 7 omp_get_thread_num: 2 omp_get_thread_num: 5 omp_get_thread_num: 4 omp_get_thread_num: 6 omp_get_thread_num: 3 sum is 45
// Reduction on a plain parallel region: every thread contributes i once.
// Build with: gcc -fopenmp (uses printf and omp_get_thread_num).
int main(int argc, char *argv[])
{
    int sum = 0;
    int i = 10;

    // No loop here: each thread adds i to its private sum exactly once,
    // so the result is i multiplied by the number of threads.
    #pragma omp parallel reduction(+:sum)
    {
        printf("omp_get_thread_num: %d\n", omp_get_thread_num());
        sum += i;
    }

    printf("sum is %d\n", sum);

    return 0;
}
omp_get_thread_num: 0 omp_get_thread_num: 3 omp_get_thread_num: 4 omp_get_thread_num: 2 omp_get_thread_num: 7 omp_get_thread_num: 6 omp_get_thread_num: 5 omp_get_thread_num: 1 sum is 80
4.18. default(shared)
// Demonstrates default(shared): variables without an explicit clause are shared.
// Build with: gcc -fopenmp (uses printf).
int main(int argc, char *argv[])
{
    int k = 100;
    int i = 1;

    // k falls under default(shared); it is only read in the loop.
    #pragma omp parallel for default(shared)
    for (i = 0; i < 8; i++)
    {
        printf("k=%d\n", k);
    }

    return 0;
}
k=100 k=100 k=100 k=100 k=100 k=100 k=100 k=100 i=1
4.19. schedule(static)
All iterations are divided equally among the threads. schedule(static, size): iterations are handed out in chunks of `size` iterations at a time.
// Prints which thread runs each of 24 iterations under schedule(static).
// Build with: gcc -fopenmp (uses printf and omp_get_thread_num).
int main(int argc, char *argv[])
{
    int i = 0;

    #pragma omp parallel for schedule(static)
    for (i = 0; i < 24; i++)
    {
        printf("i=%d, thread_id=%d\n", i, omp_get_thread_num());
    }

    return 0;
}
i=0, thread_id=0 i=14, thread_id=7 i=15, thread_id=7 i=2, thread_id=1 i=12, thread_id=6 i=3, thread_id=1 i=18, thread_id=1 i=8, thread_id=4 i=10, thread_id=5 i=19, thread_id=1 i=9, thread_id=4 i=1, thread_id=0 i=11, thread_id=5 i=16, thread_id=0 i=13, thread_id=6 i=4, thread_id=2 i=6, thread_id=3 i=7, thread_id=3 i=22, thread_id=3 i=23, thread_id=3 i=17, thread_id=0 i=5, thread_id=2 i=20, thread_id=2 i=21, thread_id=2
4.20. schedule(dynamic)
All iterations are divided dynamically among the threads. schedule(dynamic, size): iterations are handed out in chunks of `size` iterations at a time.
// Prints which thread runs each of 16 iterations under schedule(dynamic).
// Build with: gcc -fopenmp (uses printf and omp_get_thread_num).
int main(int argc, char *argv[])
{
    int i = 0;

    #pragma omp parallel for schedule(dynamic)
    for (i = 0; i < 16; i++)
    {
        printf("i=%d, thread_id=%d\n", i, omp_get_thread_num());
    }

    return 0;
}
i=7, thread_id=7 i=8, thread_id=7 i=9, thread_id=7 i=10, thread_id=7 i=11, thread_id=7 i=12, thread_id=7 i=13, thread_id=7 i=14, thread_id=7 i=15, thread_id=7 i=0, thread_id=3 i=6, thread_id=1 i=5, thread_id=5 i=1, thread_id=6 i=2, thread_id=2 i=4, thread_id=0 i=3, thread_id=4
4.21. schedule(guided)
// Prints which thread runs each of 16 iterations under schedule(guided).
// Build with: gcc -fopenmp (uses printf and omp_get_thread_num).
int main(int argc, char *argv[])
{
    int i = 0;

    #pragma omp parallel for schedule(guided)
    for (i = 0; i < 16; i++)
    {
        printf("i=%d, thread_id=%d\n", i, omp_get_thread_num());
    }

    return 0;
}
i=8, thread_id=0 i=11, thread_id=7 i=12, thread_id=7 i=13, thread_id=7 i=14, thread_id=7 i=15, thread_id=7 i=6, thread_id=4 i=7, thread_id=4 i=2, thread_id=2 i=3, thread_id=2 i=4, thread_id=6 i=5, thread_id=6 i=9, thread_id=1 i=10, thread_id=5 i=0, thread_id=3 i=1, thread_id=3
5. C/C++ with mpi
sudo apt install libopenmpi-dev
5.1. src compile config
C-h v : org-babel-C-compiler, change gcc to mpicc; run with `mpirun --use-hwthread-cpus` or `mpirun --host si-u20:8`
5.2. MPI_COMM_WORLD
#include <stdio.h> #include <stdlib.h> #include <mpi.h> int main(int argc, char* argv[]) { MPI_Init(&argc, &argv); int my_rank; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); printf("I am MPI process %d.\n", my_rank); MPI_Finalize(); return EXIT_SUCCESS; }
I am MPI process 0.
5.3. hello world of processor
#include <mpi.h> #include <stdio.h> int main(int argc, char** argv) { MPI_Init(NULL, NULL); int world_size; MPI_Comm_size(MPI_COMM_WORLD, &world_size); int world_rank; MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); char processor_name[MPI_MAX_PROCESSOR_NAME]; int name_len; MPI_Get_processor_name(processor_name, &name_len); printf("Hello world from processor %s, rank %d out of %d processors\n", processor_name, world_rank, world_size); MPI_Finalize(); }
Hello world from processor silin, rank 0 out of 1 processors
5.4. MPI_Comm_create
#include <stdio.h> #include <stdlib.h> #include <mpi.h> int main(int argc, char* argv[]) { MPI_Init(&argc, &argv); printf("\n"); // Check that the application is run with 6 processes. int size; MPI_Comm_size(MPI_COMM_WORLD, &size); if(size != 6) { printf("Please run this application with 6 MPI processes.\n"); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); } // Get my rank in the global communicator int my_rank_global; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank_global); // Get the group or processes of the default communicator MPI_Group world_group; MPI_Comm_group(MPI_COMM_WORLD, &world_group); // Create the new communicator from that group of processes. MPI_Comm local_communicator; if(my_rank_global == 0 || my_rank_global == 1 || my_rank_global == 2) { // Keep MPI processes 0, 1, 2 in a group we will call group A. int group_a_ranks[3] = {0, 1, 2}; MPI_Group group_a; MPI_Group_incl(world_group, 3, group_a_ranks, &group_a); MPI_Comm_create(MPI_COMM_WORLD, group_a, &local_communicator); } else if(my_rank_global == 3 || my_rank_global == 4) { // Keep MPI processes 3 and 4 in a group we will call group B. int group_b_ranks[2] = {3, 4}; MPI_Group group_b; MPI_Group_incl(world_group, 2, group_b_ranks, &group_b); MPI_Comm_create(MPI_COMM_WORLD, group_b, &local_communicator); } else { // Only MPI process 5 remains, without a group MPI_Comm_create(MPI_COMM_WORLD, MPI_GROUP_EMPTY, &local_communicator); } // Check if I got into a new communicator (that is, if I was in a group to begin with) if(local_communicator == MPI_COMM_NULL) { // I am not part of the new communicator, I can't participate to that broadcast. printf("MPI process %d was not part of any group, thus did not get into a new communicator.\n", my_rank_global); // Let's wait all processes before proceeding to the second phase; Cleaner output. 
MPI_Barrier(MPI_COMM_WORLD); } else { // Get my rank local to the new communicator int my_rank_local; MPI_Comm_rank(local_communicator, &my_rank_local); printf("MPI process %d in global communicator has now rank %d in new communicator.\n", my_rank_global, my_rank_local); // Let's wait all processes before proceeding to the second phase; Cleaner output. MPI_Barrier(MPI_COMM_WORLD); int size_local_communicator; MPI_Comm_size(local_communicator, &size_local_communicator); // The MPI process that got assigned rank 0 in each new communicator will gather all ranks in that communicator and display them int global_ranks_in_local_communicator[size_local_communicator]; MPI_Gather(&my_rank_global, 1, MPI_INT, global_ranks_in_local_communicator, 1, MPI_INT, 0, local_communicator); if(my_rank_local == 0) { printf("MPI processes "); for(int i = 0; i < size_local_communicator; i++) { if(i > 0) { printf(" and "); } printf("%d", global_ranks_in_local_communicator[i]); } printf(" are in the same new communicator.\n"); } } MPI_Finalize(); return EXIT_SUCCESS; }
cd ./babel
mpicc -o mpi_comm_create ./mpi_comm_create.c
mpirun --host silin:6 ./mpi_comm_create
MPI process 4 in global communicator has now rank 1 in new communicator. MPI process 2 in global communicator has now rank 2 in new communicator. MPI process 0 in global communicator has now rank 0 in new communicator. MPI process 5 was not part of any group, thus did not get into a new communicator. MPI process 1 in global communicator has now rank 1 in new communicator. MPI process 3 in global communicator has now rank 0 in new communicator. MPI processes 3 and 4 are in the same new communicator. MPI processes 0 and 1 and 2 are in the same new communicator.
5.5. AllReduce
int main(int argc, char* argv[]) { MPI_Init(&argc, &argv); // Get the size of the communicator int size = 0; MPI_Comm_size(MPI_COMM_WORLD, &size); if(size != 4) { printf("This application is meant to be run with 4 MPI processes.\n"); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); } // Get my rank int my_rank; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); // Each MPI process sends its rank to reduction, root MPI process collects the result int reduction_result = 0; MPI_Allreduce(&my_rank, &reduction_result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); printf("[MPI Process %d] The sum of all ranks is %d.\n", my_rank, reduction_result); MPI_Finalize(); return EXIT_SUCCESS; }
6. Python
6.1. Multi Thread
6.1.1. pure thread start
import threading def myfunction(a,b): print(a*b) thread1 = threading.Thread(target= myfunction, args=(2,3)) thread2 = threading.Thread(target= myfunction, args=(3,4)) thread1.start() thread2.start()
6.1.2. thread class
Override __init__ and run in a class that inherits from threading.Thread, and start it with start().
import threading import time class myThread(threading.Thread): def __init__(self, threadID, name, counter): threading.Thread.__init__(self) self.threadID = threadID self.name = name self.counter = counter def run(self): print("Starting " + self.name) self.print_time(self.name, 5, self.counter) print("Exiting" + self.name) def print_time(self, threadName, counter, delay): while counter: time.sleep(delay) print(f"{threadName}, {time.ctime(time.time())}") counter -= 1 thread1 = myThread(1, "threadone", 1) thread2 = myThread(2, "threadtwo", 2) thread1.start() thread2.start() time.sleep(1) print("Exiting Main Thread.")
Starting threadone Starting threadtwo threadone, Tue Jun 7 23:53:02 2022 Exiting Main Thread. threadtwo, Tue Jun 7 23:53:03 2022 threadone, Tue Jun 7 23:53:03 2022 threadone, Tue Jun 7 23:53:04 2022 threadtwo, Tue Jun 7 23:53:05 2022 threadone, Tue Jun 7 23:53:05 2022 threadone, Tue Jun 7 23:53:06 2022 Exitingthreadone threadtwo, Tue Jun 7 23:53:07 2022 threadtwo, Tue Jun 7 23:53:09 2022 threadtwo, Tue Jun 7 23:53:11 2022 Exitingthreadtwo
6.1.3. thread with Lock
import threading
import time

# One lock shared by all worker threads: only the holder may run print_time.
myLock = threading.Lock()


class myThread(threading.Thread):
    """Thread that prints timestamps while holding the shared lock."""

    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        print("Starting " + self.name)
        # The context manager releases the lock even if print_time raises.
        with myLock:
            self.print_time(self.name, 5, self.counter)
        print("Exiting" + self.name)

    def print_time(self, threadName, counter, delay):
        """Print the current time `counter` times, sleeping `delay` seconds before each."""
        while counter:
            time.sleep(delay)
            print(f"{threadName}, {time.ctime(time.time())}")
            counter -= 1


thread1 = myThread(1, "threadone", 1)
thread2 = myThread(2, "threadtwo", 2)

thread1.start()
thread2.start()
time.sleep(1)
print("Exiting Main Thread.")
Starting threadone Starting threadtwo threadone, Tue Jun 7 23:53:58 2022 Exiting Main Thread. threadone, Tue Jun 7 23:53:59 2022 threadone, Tue Jun 7 23:54:00 2022 threadone, Tue Jun 7 23:54:01 2022 threadone, Tue Jun 7 23:54:02 2022 Exitingthreadone threadtwo, Tue Jun 7 23:54:04 2022 threadtwo, Tue Jun 7 23:54:06 2022 threadtwo, Tue Jun 7 23:54:08 2022 threadtwo, Tue Jun 7 23:54:10 2022 threadtwo, Tue Jun 7 23:54:12 2022 Exitingthreadtwo
6.2. Multi processing
6.2.1. pure multi process start
import multiprocessing


def myfunction(a, b):
    """Print the product of a and b."""
    print(a * b)


# Guard the entry point: with the "spawn" start method the child process
# re-imports this module and must not create processes again.
if __name__ == "__main__":
    process1 = multiprocessing.Process(target=myfunction, args=(2, 9))
    process2 = multiprocessing.Process(target=myfunction, args=(3, 4))

    process1.start()
    process2.start()

    # Wait for both children before the parent exits.
    process1.join()
    process2.join()
18 12
6.2.2. multi process differ from multi threading
#+begin_src python :results output :exports both import multiprocessing import threading import random def myfunction(shared_result, lock=None): value = random.randint(1, 10) if lock: with lock: shared_result.append(value) else: shared_result.append(value) # Multiprocessing if __name__ == "__main__": with multiprocessing.Manager() as manager: result = manager.list() # Shared list for multiprocessing lock = multiprocessing.Lock() process1 = multiprocessing.Process(target=myfunction, args=(result, lock)) process2 = multiprocessing.Process(target=myfunction, args=(result, lock)) process1.start() process2.start() process1.join() process2.join() print("Result of multiprocessing:", list(result)) # Threading result = [] # Normal list for threading lock = threading.Lock() thread1 = threading.Thread(target=myfunction, args=(result, lock)) thread2 = threading.Thread(target=myfunction, args=(result, lock)) thread1.start() thread2.start() thread1.join() thread2.join() print("Result of multi-threading:", result)
Result of multiprocessing: [4, 1] Result of multi-threading: [8, 5]
#+end_src
6.2.3. Pool
- apply
import multiprocessing as mp


def myfunction(a, b):
    """Return the product of a and b."""
    return a * b


# Guard the entry point so worker processes that re-import this module
# do not create pools of their own.
if __name__ == "__main__":
    # The with-block shuts the pool down even if a task raises.
    with mp.Pool(mp.cpu_count()) as pool:
        # apply() blocks until each call has finished, so tasks run one at a time.
        result = [pool.apply(myfunction, args=(a, 2)) for a in range(1, 100)]

    print(result)
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198]
- map
import multiprocessing as mp


def myfunction(a):
    """Return the square of a."""
    return a * a


# Guard the entry point so worker processes that re-import this module
# do not create pools of their own.
if __name__ == "__main__":
    # The with-block shuts the pool down even if a task raises.
    with mp.Pool(mp.cpu_count()) as pool:
        # map() spreads the inputs over the workers and returns results in input order.
        result = pool.map(myfunction, range(1, 100))

    print(result)
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024, 1089, 1156, 1225, 1296, 1369, 1444, 1521, 1600, 1681, 1764, 1849, 1936, 2025, 2116, 2209, 2304, 2401, 2500, 2601, 2704, 2809, 2916, 3025, 3136, 3249, 3364, 3481, 3600, 3721, 3844, 3969, 4096, 4225, 4356, 4489, 4624, 4761, 4900, 5041, 5184, 5329, 5476, 5625, 5776, 5929, 6084, 6241, 6400, 6561, 6724, 6889, 7056, 7225, 7396, 7569, 7744, 7921, 8100, 8281, 8464, 8649, 8836, 9025, 9216, 9409, 9604, 9801]
- starmap
import multiprocessing as mp


def myfunction(a, b):
    """Return the product of a and b."""
    return a * b


# Guard the entry point so worker processes that re-import this module
# do not create pools of their own.
if __name__ == "__main__":
    # The with-block shuts the pool down even if a task raises.
    with mp.Pool(mp.cpu_count()) as pool:
        # starmap() unpacks each tuple into the positional arguments of myfunction.
        result = pool.starmap(myfunction, [(a, 2) for a in range(1, 100)])

    print(result)
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198]
- apply_async
import multiprocessing as mp


def myfunction(a, b):
    """Return the product of a and b."""
    return a * b


# Guard the entry point so worker processes that re-import this module
# do not create pools of their own.
if __name__ == "__main__":
    # The with-block shuts the pool down even if a task raises.
    with mp.Pool(mp.cpu_count()) as pool:
        # Submit everything first, then collect; get() blocks until a task is done.
        pending = [pool.apply_async(myfunction, args=(a, 2)) for a in range(1, 100)]
        result = [r.get() for r in pending]

    print(result)
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198]
- applyasync with callback, must use wait
import multiprocessing as mp

# Filled by collect_result in the parent process as tasks complete.
results = []


def myfunction(a, b):
    """Return the product of a and b."""
    return a * b


def collect_result(result):
    """Callback for apply_async: store one finished result."""
    global results
    results.append(result)


# Guard the entry point so worker processes that re-import this module
# do not create pools of their own.
if __name__ == "__main__":
    pool = mp.Pool(mp.cpu_count())
    for a in range(1, 100):
        pool.apply_async(myfunction, args=(a, 2), callback=collect_result)

    # close() stops new submissions; join() waits for every task and its
    # callback, so results is complete before it is printed.
    pool.close()
    pool.join()

    # Callbacks run in completion order, which may differ from submission order.
    print(results)
6.2.4. map
import time
import concurrent.futures


def do_something(n):
    """Sleep for n seconds and return a completion message."""
    print(f"sleeping {n} seconds")
    time.sleep(n)
    return f"Done {n} seconds sleep"


# Guard the entry point: worker processes may re-import this module and
# must not start executors of their own.
if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor() as executor:
        secs = [5, 4, 3, 2, 1]
        # map() yields results in the order of the inputs, not of completion.
        results = executor.map(do_something, secs)
        for result in results:
            print(result)
sleeping 1 seconds sleeping 2 seconds sleeping 3 seconds sleeping 4 seconds sleeping 5 seconds Done 5 seconds sleep Done 4 seconds sleep Done 3 seconds sleep Done 2 seconds sleep Done 1 seconds sleep
7. Python with mpi
7.1. send & recv
from mpi4py import MPI comm = MPI.COMM_WORLD rank = comm.Get_rank() if rank == 0: data = {'a': 1, 'b': 2, 'c':'test string'} comm.send(data,dest=1,tag=11) elif rank == 1: data = comm.recv(source=0,tag=11) print(data)
mpirun -n 2 python3 ./babel/mpi_python_example01.py
{'a': 1, 'b': 2, 'c': 'test string'}
7.2. Send & Recv
from mpi4py import MPI
import numpy

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Buffer-based Send/Recv: rank 0 sends a NumPy array, rank 1 receives it
# into a preallocated array of the same size and dtype.
# numpy.float was removed from NumPy; numpy.float64 is the same dtype.
if rank == 0:
    data = numpy.arange(100, dtype=numpy.float64)
    comm.Send(data, dest=1, tag=11)
elif rank == 1:
    data = numpy.empty(100, dtype=numpy.float64)
    comm.Recv(data, source=0, tag=11)
    print(data)
mpirun -n 2 python3 ./babel/mpi_python_example02.py
[ 0. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 14. 15. 16. 17. 18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36. 37. 38. 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52. 53. 54. 55. 56. 57. 58. 59. 60. 61. 62. 63. 64. 65. 66. 67. 68. 69. 70. 71. 72. 73. 74. 75. 76. 77. 78. 79. 80. 81. 82. 83. 84. 85. 86. 87. 88. 89. 90. 91. 92. 93. 94. 95. 96. 97. 98. 99.]
7.3. Bcast
7.3.1. with map bcast
from mpi4py import MPI comm = MPI.COMM_WORLD rank = comm.Get_rank() if rank == 0: data = {'key1' : [7, 2.72, 2+3j], 'key2' : ( 'abc', 'xyz')} else: data = None data = comm.bcast(data, root=0) print(data)
mpirun -n 3 python3 ./babel/mpi_python_example03.py
{'key1': [7, 2.72, (2+3j)], 'key2': ('abc', 'xyz')} {'key1': [7, 2.72, (2+3j)], 'key2': ('abc', 'xyz')} {'key1': [7, 2.72, (2+3j)], 'key2': ('abc', 'xyz')}
7.3.2. with array Bcast
from mpi4py import MPI import numpy as np comm = MPI.COMM_WORLD rank = comm.Get_rank() if rank == 0: data = np.arange(100, dtype='i') else: data = np.empty(100, dtype='i') comm.Bcast(data, root=0) for i in range(100): assert data[i] == i
mpirun -n 3 python3 ./babel/mpi_python_example07.py
7.4. scatter
7.4.1. with number
from mpi4py import MPI comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() if rank == 0: data = [(i+1)**2 for i in range(size)] else: data = None data = comm.scatter(data, root=0) assert data == (rank+1)**2
mpirun -n 4 python3 ./babel/mpi_python_example04.py
7.4.2. with array
from mpi4py import MPI import numpy as np comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() sendbuf = None if rank == 0: sendbuf = np.empty([size, 100], dtype='i') sendbuf.T[:,:] = range(size) recvbuf = np.empty(100, dtype='i') comm.Scatter(sendbuf, recvbuf, root=0) assert np.allclose(recvbuf, rank)
mpirun --host si-u20:6 python3 ./babel/mpi_python_example08.py
7.5. Gather
7.5.1. with number gather
from mpi4py import MPI comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() data = (rank+1)**2 data = comm.gather(data, root=0) if rank == 0: for i in range(size): assert data[i] == (i+1)**2 else: assert data is None
mpirun --host si-u20:8 python3 ./babel/mpi_python_example05.py
8 8 8 8 8 8 8 8
7.5.2. with array Gather
from mpi4py import MPI import numpy as np comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() sendbuf = np.zeros(100, dtype='i') + rank recvbuf = None if rank == 0: recvbuf = np.empty([size, 100], dtype='i') comm.Gather(sendbuf, recvbuf, root=0) if rank == 0: for i in range(size): assert np.allclose(recvbuf[i,:], i)
mpirun -n 4 python3 ./babel/mpi_python_example09.py
7.6. Dynamic Process Management
from mpi4py import MPI import numpy import sys comm = MPI.COMM_SELF.Spawn(sys.executable, args=['./mpi_python_example_server.py'], maxprocs=2) N = numpy.array(100, 'i') comm.Bcast([N, MPI.INT], root=MPI.ROOT) PI = numpy.array(0.0, 'd') comm.Reduce(None, [PI, MPI.DOUBLE], op=MPI.SUM, root=MPI.ROOT) print(PI) comm.Disconnect()
from mpi4py import MPI import numpy comm = MPI.Comm.Get_parent() size = comm.Get_size() rank = comm.Get_rank() N = numpy.array(0, dtype='i') comm.Bcast([N, MPI.INT], root=0) h = 1.0 / N; s = 0.0 for i in range(rank, N, size): x = h * (i + 0.5) s += 4.0 / (1.0 + x**2) PI = numpy.array(s * h, dtype='d') comm.Reduce([PI, MPI.DOUBLE], None, op=MPI.SUM, root=0) comm.Disconnect()
mpirun --host si-u20:4 python3 ./babel/mpi_python_example_client.py
8. Go
8.1. go chan
package main import ( "fmt" "time" ) func main() { var times int go func() { for { } }() go func() { for { } }() go func() { for { } }() go func() { for { } }() go func() { for { } }() for times = 0; times <= 10; times++ { fmt.Println("tick", times) time.Sleep(time.Second) } }
package main import ( "fmt" "time" ) func main() { ch1 := make(chan int) go func (ch chan int){ for{ select { case <- ch: fmt.Println("get it") default: fmt.Println("still not") } } }(ch1) time.Sleep(time.Second) ch1 <- 2; }
9. Go with mpi
9.1. cpuNum
package main import ( "fmt" "runtime" ) func main() { cpuNum := runtime.NumCPU() fmt.Println("cpu number is :", cpuNum) runtime.GOMAXPROCS(cpuNum) }
cpu number is : 8
go build ./babel/go_mpi_cpuNum.go && ./babel/go_mpi_cpuNum
cpu number is : 8
9.2. IsOn
package main import ( "fmt" mpi "github.com/sbromberger/gompi" ) func main() { ison := mpi.IsOn() if ison { fmt.Println("ison is on") } else { fmt.Println("ison is not on") } mpi.Start(true) ison = mpi.IsOn() if ison { fmt.Println("ison is on") } else { fmt.Println("ison is not on ") } mpi.Stop() }
cd babel
go mod tidy
go build ./go_mpi_ison.go && ./go_mpi_ison
ison is not on ison is on
9.3. WorldRank
package main import ( "fmt" mpi "github.com/sbromberger/gompi" ) func main() { mpi.Start(true) rank := mpi.WorldRank() fmt.Println("rank is", rank) mpi.Stop() }
cd babel
go mod tidy
go build ./go_mpi_WorldRank.go && mpirun --use-hwthread-cpus ./go_mpi_WorldRank
rank is 0 rank is 1 rank is 2 rank is 3 rank is 5 rank is 7 rank is 6 rank is 4
9.4. WorldSize
package main

import (
	"fmt"

	mpi "github.com/sbromberger/gompi"
)

// main prints the value of mpi.WorldSize() for this process.
func main() {
	mpi.Start(true)
	defer mpi.Stop()

	// NOTE(review): the label says "rank" although the value printed is
	// the world size; kept as is so the output matches the recorded run.
	fmt.Println("rank is", mpi.WorldSize())
}
cd babel
go mod tidy
go build ./go_mpi_WorldSize.go && mpirun --use-hwthread-cpus ./go_mpi_WorldSize
rank is 8 rank is 8 rank is 8 rank is 8 rank is 8 rank is 8 rank is 8 rank is 8
9.5. NewCommunicator
package main

import (
	"fmt"

	mpi "github.com/sbromberger/gompi"
)

// main builds a communicator from a nil rank list and prints its size
// and this process's rank within it.
func main() {
	mpi.Start(true)
	defer mpi.Stop()

	// A nil rank list is passed; presumably this selects every launched
	// process (the recorded 8-process run reports a size of 8).
	world := mpi.NewCommunicator(nil)

	fmt.Println("newComm.Size() is :", world.Size())
	fmt.Println("newComm.Rank() is :", world.Rank())
}
cd babel
go mod tidy
go build ./go_mpi_NewCommunicator.go && mpirun --use-hwthread-cpus ./go_mpi_NewCommunicator
newComm.Size() is : 8 newComm.Rank() is : 1 newComm.Size() is : 8 newComm.Rank() is : 7 newComm.Size() is : 8 newComm.Rank() is : 2 newComm.Size() is : 8 newComm.Size() is : 8 newComm.Rank() is : 0 newComm.Rank() is : 4 newComm.Size() is : 8 newComm.Rank() is : 3 newComm.Size() is : 8 newComm.Rank() is : 5 newComm.Size() is : 8 newComm.Rank() is : 6
package main

import (
	"fmt"

	mpi "github.com/sbromberger/gompi"
)

// main builds a communicator from the explicit rank list 0..3 and prints
// its size and this process's rank within it.
func main() {
	mpi.Start(true)
	defer mpi.Stop()

	group := mpi.NewCommunicator([]int{0, 1, 2, 3})

	fmt.Println("newComm.Size() is :", group.Size())
	fmt.Println("newComm.Rank() is :", group.Rank())
}
cd babel
go build ./go_mpi_NewCommunicator_withranks.go && mpirun --host si-u20:4 ./go_mpi_NewCommunicator_withranks
newComm.Size() is : 4 newComm.Rank() is : 0 newComm.Size() is : 4 newComm.Rank() is : 1 newComm.Size() is : 4 newComm.Rank() is : 2 newComm.Size() is : 4 newComm.Rank() is : 3
9.6. SendInt32 && RecvInt32
package main

import (
	"fmt"

	mpi "github.com/sbromberger/gompi"
)

// main sends the int32 value 60 from rank 0 to rank 1 with tag 10; rank 1
// prints what it received. Every rank first prints one empty line, and
// all other ranks do nothing else.
func main() {
	const (
		sender   = 0
		receiver = 1
		tag      = 10
	)

	mpi.Start(true)
	defer mpi.Stop()

	fmt.Println()
	comm := mpi.NewCommunicator(nil)

	switch comm.Rank() {
	case sender:
		comm.SendInt32(60, receiver, tag)
	case receiver:
		// The second return value is discarded, as in the original notes.
		value, _ := comm.RecvInt32(sender, tag)
		fmt.Println(value)
	}
}
cd babel
go mod tidy
go build ./go_mpi_SendInt32.go
mpirun --use-hwthread-cpus ./go_mpi_SendInt32
60
9.7. SendInt32s && RecvInt32s
package main

import (
	"fmt"

	mpi "github.com/sbromberger/gompi"
)

// main sends the slice [1 2 3 4] from rank 0 to rank 1 with tag 10; rank 1
// prints what it received. Every rank first prints one empty line.
//
// The element type is int32 on both sides. The earlier version sent
// []uint32 with SendUInt32s while receiving with RecvInt32s, so the send
// and receive types did not match each other or the section title.
func main() {
	const (
		sender   = 0
		receiver = 1
		tag      = 10
	)

	mpi.Start(true)
	defer mpi.Stop()

	fmt.Println()
	comm := mpi.NewCommunicator(nil)

	switch comm.Rank() {
	case sender:
		payload := []int32{1, 2, 3, 4}
		comm.SendInt32s(payload, receiver, tag)
	case receiver:
		// The second return value is discarded, as in the original notes.
		received, _ := comm.RecvInt32s(sender, tag)
		fmt.Println(received)
	}
}
cd babel
go mod tidy
go build ./go_mpi_SendUInt32s.go
mpirun --host si:6 ./go_mpi_SendUInt32s
[1 2 3 4]
9.8. SendString && RecvString
package main

import (
	"fmt"

	mpi "github.com/sbromberger/gompi"
)

// main sends two strings from rank 0 to rank 1, the first with tag 11 and
// the second with tag 10; rank 1 receives them in the same order and
// prints both. Every rank first prints one empty line.
func main() {
	const (
		greetingTag = 11
		textTag     = 10
	)

	mpi.Start(true)
	defer mpi.Stop()

	fmt.Println()
	comm := mpi.NewCommunicator(nil)

	// Distinct tags are the better choice, although the author notes that
	// using the same tag for both messages also works here.
	switch comm.Rank() {
	case 0:
		text := "str11"
		comm.SendString("nihao", 1, greetingTag)
		comm.SendString(text, 1, textTag)
	case 1:
		greeting, _ := comm.RecvString(0, greetingTag)
		text, _ := comm.RecvString(0, textTag)
		fmt.Println(greeting)
		fmt.Println(text)
	}
}
cd babel
go mod tidy
go build ./go_mpi_SendString.go
mpirun -n 2 ./go_mpi_SendString
nihao str11
9.9. ReduceInt32s
package main

import (
	"fmt"

	mpi "github.com/sbromberger/gompi"
)

// main has every rank contribute [7 2 3 4] to a ReduceInt32s call with
// mpi.OpSum and root 0; rank 0 then prints the destination slice.
// Every rank first prints one empty line.
func main() {
	mpi.Start(true)
	defer mpi.Stop()

	fmt.Println()
	comm := mpi.NewCommunicator(nil)

	contribution := []int32{7, 2, 3, 4}
	total := make([]int32, len(contribution))
	comm.ReduceInt32s(total, contribution, mpi.OpSum, 0)

	if comm.Rank() == 0 {
		fmt.Println(total)
	}
}
cd babel
go mod tidy
go build ./go_mpi_ReduceInt32s.go
mpirun -n 4 ./go_mpi_ReduceInt32s
[28 8 12 16]
9.10. AllreduceInt64s
package main

import (
	"fmt"

	mpi "github.com/sbromberger/gompi"
)

// main has every rank contribute [20*rank, 2] to an AllreduceInt64s call
// with mpi.OpSum; only rank 0 prints the resulting slice.
func main() {
	mpi.Start(true)
	defer mpi.Stop()

	comm := mpi.NewCommunicator(nil)
	rank := comm.Rank()

	contribution := []int64{20 * int64(rank), 2}
	total := make([]int64, len(contribution))
	comm.AllreduceInt64s(total, contribution, mpi.OpSum, 0)

	if rank == 0 {
		fmt.Println(total)
	}
}
cd babel
go mod tidy
go build ./go_mpi_AllreduceInt64s.go
mpirun -n 4 ./go_mpi_AllreduceInt64s
[120 8]
9.11. BcastInt64s
package main

import (
	"fmt"

	mpi "github.com/sbromberger/gompi"
)

// main gives every rank a one-element slice holding 10+rank, prints it,
// calls BcastInt64s with root 0, and then prints the slice again on every
// rank except 0.
func main() {
	mpi.Start(true)
	defer mpi.Stop()

	comm := mpi.NewCommunicator(nil)
	rank := comm.Rank()

	value := []int64{10 + int64(rank)}
	fmt.Printf("process %v now has original value of %v from main process \n", rank, value)

	comm.BcastInt64s(value, 0)

	if rank == 0 {
		return
	}
	fmt.Printf("process %v now has bordcasted value of %v from main process \n", rank, value)
}
cd babel
go mod tidy
go build ./go_mpi_BcastInt64s.go
mpirun -n 4 ./go_mpi_BcastInt64s
process 3 now has original value of [13] from main process process 1 now has original value of [11] from main process process 0 now has original value of [10] from main process process 2 now has original value of [12] from main process process 1 now has bordcasted value of [10] from main process process 2 now has bordcasted value of [10] from main process process 3 now has bordcasted value of [10] from main process
9.12. distributed-client
package main

import (
	"fmt"
	"math/rand"
	"sync"
	"time"

	mpi "github.com/sbromberger/gompi"
)

// serve answers one worker forever: it receives a vector from the worker
// (tag == worker rank), multiplies it element-wise by weights, and sends
// the result back to the same worker with the same tag. It never returns.
func serve(comm *mpi.Communicator, worker int, weights []float64) {
	for {
		vec, _ := comm.RecvFloat64s(worker, worker)
		for i := range vec {
			vec[i] = vec[i] * weights[i]
		}
		comm.SendFloat64s(vec, worker, worker)
	}
}

// main runs a small server/worker exchange.
//
// Rank 0 is the server: it starts one serve goroutine per worker rank and
// then blocks so those goroutines stay alive. Every other rank is a
// worker: once per second it adds a random integer in [0, 10) to each
// element of its vector, prints it, sends it to rank 0, and replaces its
// vector with the reply.
//
// Previously rank 0 called mpi.Stop() and returned right after launching
// its goroutines, so the process exited before serving anything, and the
// worker ranks 1, 2 and 3 were hard-coded. Workers are now derived from
// the communicator size.
func main() {
	mpi.Start(true)

	comm := mpi.NewCommunicator(nil)
	rank := comm.Rank()
	fmt.Println("rank is ", rank)

	start := []float64{0, 0, 0}
	// Server-side multipliers. They are all zero, so every reply the
	// server sends is a vector of zeros.
	ends := []float64{0, 0, 0}

	comm.BcastFloat64s(start, 0)

	if rank != 0 {
		// Worker loop; it never terminates.
		for {
			for i, x := range start {
				start[i] = x + float64(rand.Intn(10))
			}
			time.Sleep(time.Second)
			fmt.Printf("process %v now has value of %v from main process \n", rank, start)
			comm.SendFloat64s(start, 0, rank)
			start, _ = comm.RecvFloat64s(0, rank)
		}
	}

	// Server: one goroutine per worker. serve never returns, so Wait
	// blocks for as long as at least one worker exists; with a single
	// process there are no workers and Wait returns immediately.
	var servers sync.WaitGroup
	for worker := 1; worker < comm.Size(); worker++ {
		servers.Add(1)
		go func(worker int) {
			defer servers.Done()
			serve(comm, worker, ends)
		}(worker)
	}
	servers.Wait()

	mpi.Stop()
}
cd babel
go mod tidy
go build ./go_mpi_distributed-client.go
mpirun -n 4 ./go_mpi_distributed-client
9.13. Distributed learning server client
// Draft of a "distributed learning" server/client exchange on top of gompi.
//
// Intended shape, as far as the code shows: rank 0 receives vectors from
// ranks 1..3 on three goroutines, forwards them over channels to a loop
// that multiplies them into `start` and broadcasts the result; the other
// ranks periodically increment their vector and send it to rank 0.
//
// NOTE(review): this listing has several problems that should be resolved
// before it is used; each is marked where it occurs below.
package main

import (
	"fmt"
	"sync"
	"time"
	mpi "github.com/sbromberger/gompi"
)

// NOTE(review): the function is named mainn, so this package declares no
// main function — presumably a typo or a deliberate way to disable the
// draft; confirm with the author.
func mainn(){
	mpi.Start(true)
	var ranks []int
	newComm := mpi.NewCommunicator(ranks)
	// Every element of the local vector starts as this process's rank.
	start := []float64{float64(newComm.Rank()), float64(newComm.Rank()), float64(newComm.Rank())}
	// ends := []float64{0, 0, 0}
	// Guards the updates of start performed in the select loop below.
	var m sync.Mutex
	// One buffered channel (capacity 3) per sending rank.
	ch1 := make(chan []float64, 3)
	ch2 := make(chan []float64, 3)
	ch3 := make(chan []float64, 3)
	fmt.Printf("Orignal: %v now has value of %v from main process \n", newComm.Rank(), start)
	newComm.BcastFloat64s(start, 0)

	// Rank 0 only: one receiver goroutine per sender; each forwards every
	// vector it receives (source == tag == sender rank) to its channel.
	if newComm.Rank() == 0 {
		go func (ch0 chan []float64){
			for {
				start, _ := newComm.RecvFloat64s(1, 1)
				ch0 <- start
			}
		}(ch1)
		go func (ch0 chan []float64){
			for {
				start, _ := newComm.RecvFloat64s(2, 2)
				ch0 <- start;
			}
		}(ch2)
		go func (ch0 chan []float64){
			for {
				start, _ := newComm.RecvFloat64s(3, 3)
				ch0 <- start;
			}
		}(ch3)
	}

	// NOTE(review): this function literal is called directly (there is no
	// `go` keyword) on every rank, and its loop has no exit, so none of the
	// statements after it can ever run — including the sender loop for the
	// non-zero ranks and mpi.Stop().
	func (newComm *mpi.Communicator ) {
		for {
			select {
			// NOTE(review): each case below receives from its channel twice:
			// the value taken by the case itself is discarded, and `data`
			// comes from a second receive that waits for another message.
			// NOTE(review): BcastFloat64s is called from inside this loop,
			// while the sender loop further down never calls it — confirm the
			// intended communication pattern.
			case <- ch1:
				m.Lock()
				data := <- ch1
				for i, _ := range start {
					start[i] = start[i] * data[i]
				}
				m.Unlock()
				newComm.BcastFloat64s(start, 0)
				fmt.Printf("From process %v now has value of %v from main process \n", data, start)
			case <- ch2:
				m.Lock()
				data := <- ch2
				for i, _ := range start {
					start[i] = start[i] * data[i]
				}
				m.Unlock()
				newComm.BcastFloat64s(start, 0)
				fmt.Printf("From process %v now has value of %v from main process \n", data, start)
			case <- ch3:
				m.Lock()
				data := <- ch3
				for i, _ := range start {
					start[i] = start[i] * data[i]
				}
				m.Unlock()
				newComm.BcastFloat64s(start, 0)
				fmt.Printf("From process %v now has value of %v from main process \n", data, start)
			// The empty default makes the select non-blocking, so this loop
			// spins continuously when no channel has data.
			default:
			}
		}
	}(newComm)

	// Sender side (ranks other than 0): every two seconds print the vector,
	// add 1 to each element, and send it to rank 0 tagged with this rank.
	if newComm.Rank() != 0 {
		for {
			time.Sleep(time.Second*2)
			fmt.Printf("PROCESS: %v now has value of %v from main process \n", newComm.Rank(), start)
			for i, x := range start {
				start[i] = x+1
			}
			newComm.SendFloat64s(start, 0, newComm.Rank())
		}
	}
	mpi.Stop()
}
9.14. Distributed learning server client one to one
package main

import (
	"fmt"
	"time"

	mpi "github.com/sbromberger/gompi"
)

// main runs a one-server / one-client exchange.
//
// Every rank starts an updater goroutine that waits on a notification
// channel. On rank 0 a second goroutine receives vectors from rank 1 and
// posts the value 1 on that channel for each one. Rank 1 loops forever,
// adding 2 to its vector every two seconds and sending it to rank 0.
// Ranks that are not stuck in the client loop sleep for a minute and
// then stop MPI.
func main() {
	mpi.Start(true)

	comm := mpi.NewCommunicator(nil)
	notify := make(chan int)

	start := []float64{float64(comm.Rank()), float64(comm.Rank()), float64(comm.Rank())}
	fmt.Printf("Orignal: %v now has value of %v from main process \n", comm.Rank(), start)

	// Updater: after a one-second delay, scale the vector by each factor
	// that arrives, print it and broadcast it; report a timeout whenever a
	// full minute passes without a notification.
	go func() {
		time.Sleep(time.Second * 1)
		for {
			select {
			case factor := <-notify:
				for i := range start {
					start[i] = start[i] * float64(factor)
				}
				fmt.Printf("From server: %v now has value of %v from main process \n", comm.Rank(), start)
				comm.BcastFloat64s(start, 0)
			case <-time.After(time.Minute):
				fmt.Println("Time out")
			}
		}
	}()

	// Server side: the received vector itself is discarded; each arrival
	// only triggers a notification carrying the value 1.
	if comm.Rank() == 0 {
		go func() {
			for {
				comm.RecvFloat64s(1, 1)
				notify <- 1
			}
		}()
	}

	// Client side: this loop never ends, so rank 1 does not reach the
	// final sleep and mpi.Stop().
	if comm.Rank() == 1 {
		for {
			time.Sleep(time.Second * 2)
			for i := range start {
				start[i] = start[i] + 2
			}
			fmt.Printf("From client: %v now has value of %v from main process \n", comm.Rank(), start)
			comm.SendFloat64s(start, 0, comm.Rank())
		}
	}

	time.Sleep(time.Minute)
	mpi.Stop()
}
10. GPGPU
10.1. get variables
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>

#define N 32

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Stores the built-in index/dimension variables into twelve single-int
 * device locations, one per component.
 *
 * Every launched thread writes to the same twelve addresses, so for the
 * threadIdx/blockIdx components the value left behind is that of whichever
 * thread's store lands last; blockDim and gridDim are the same for all
 * threads of a launch.
 */
__global__ void add(
    int *dthreadIdx_x, int *dthreadIdx_y, int *dthreadIdx_z,
    int *dblockIdx_x,  int *dblockIdx_y,  int *dblockIdx_z,
    int *dblockDim_x,  int *dblockDim_y,  int *dblockDim_z,
    int *dgridDim_x,   int *dgridDim_y,   int *dgridDim_z) {
    *dthreadIdx_x = threadIdx.x;
    *dthreadIdx_y = threadIdx.y;
    *dthreadIdx_z = threadIdx.z;
    *dblockIdx_x = blockIdx.x;
    *dblockIdx_y = blockIdx.y;
    *dblockIdx_z = blockIdx.z;
    *dblockDim_x = blockDim.x;
    *dblockDim_y = blockDim.y;
    *dblockDim_z = blockDim.z;
    *dgridDim_x = gridDim.x;
    *dgridDim_y = gridDim.y;
    *dgridDim_z = gridDim.z;
}

/*
 * Launches the kernel with one block of N x N x 1 threads, copies each of
 * the twelve values back and prints them, then releases the device memory.
 * The device allocations were previously never freed and no CUDA call or
 * the launch itself was checked for errors.
 */
int main() {
    enum { kFieldCount = 12 };
    /* Same order as the kernel parameters and as the printed report. */
    static const char *const kFieldNames[kFieldCount] = {
        "threadIdx_x", "threadIdx_y", "threadIdx_z",
        "blockIdx_x",  "blockIdx_y",  "blockIdx_z",
        "blockDim_x",  "blockDim_y",  "blockDim_z",
        "gridDim_x",   "gridDim_y",   "gridDim_z",
    };

    int *dev[kFieldCount];
    for (int k = 0; k < kFieldCount; ++k) {
        CUDA_CHECK(cudaMalloc((void **)&dev[k], sizeof(int)));
    }

    dim3 BlockPerGrid(1, 1, 1);
    dim3 ThreadsPerBlock(N, N, 1);
    add<<<BlockPerGrid, ThreadsPerBlock>>>(
        dev[0], dev[1], dev[2],
        dev[3], dev[4], dev[5],
        dev[6], dev[7], dev[8],
        dev[9], dev[10], dev[11]);
    /* A bad launch configuration is only reported through this call. */
    CUDA_CHECK(cudaGetLastError());

    for (int k = 0; k < kFieldCount; ++k) {
        int value = 0;
        /* The blocking copy also surfaces errors raised while the kernel ran. */
        CUDA_CHECK(cudaMemcpy(&value, dev[k], sizeof(int), cudaMemcpyDeviceToHost));
        printf("%s is %d \n", kFieldNames[k], value);
    }

    for (int k = 0; k < kFieldCount; ++k) {
        CUDA_CHECK(cudaFree(dev[k]));
    }
    return 0;
}
cd babel
nvcc gpu_get_variables.cu -o gpu_get_variables
./gpu_get_variables
10.2. Addition dim 1
10.2.1. without parallelism
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define N 10000000
#define MAX_ERR 1e-6

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise out[i] = a[i] + b[i] for i in [0, n).
 *
 * Deliberately serial: each thread that runs this kernel walks the whole
 * range itself, so it is meant to be launched as <<<1, 1>>>.
 */
__global__ void vector_add(float *out, const float *a, const float *b, int n) {
    for (int i = 0; i < n; i++) {
        out[i] = a[i] + b[i];
    }
}

/*
 * Adds two N-element vectors on the device with a single thread and
 * verifies the result on the host.
 *
 * Host allocations, CUDA calls and the launch are now checked, and the
 * verification no longer relies on assert(), which is compiled out under
 * NDEBUG and would then let "PASSED" print without any check.
 * Returns 0 on success and EXIT_FAILURE otherwise.
 */
int main() {
    const size_t bytes = sizeof(float) * N;

    // Allocate host memory
    float *a = (float *)malloc(bytes);
    float *b = (float *)malloc(bytes);
    float *out = (float *)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        free(out);
        return EXIT_FAILURE;
    }

    // Initialize host arrays
    for (int i = 0; i < N; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Allocate device memory
    float *d_a = NULL, *d_b = NULL, *d_out = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_out, bytes));

    // Transfer data from host to device memory
    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    // Executing kernel
    vector_add<<<1, 1>>>(d_out, d_a, d_b, N);
    CUDA_CHECK(cudaGetLastError());

    // Transfer data back to host memory (blocking, so it also reports
    // errors raised while the kernel was running)
    CUDA_CHECK(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost));

    // Verification; the negated comparison also rejects NaN
    int first_bad = -1;
    for (int i = 0; i < N; i++) {
        if (!(fabs(out[i] - a[i] - b[i]) < MAX_ERR)) {
            first_bad = i;
            break;
        }
    }

    if (first_bad < 0) {
        printf("out[0] = %f\n", out[0]);
        printf("PASSED\n");
    } else {
        fprintf(stderr, "FAILED: out[%d] = %f\n", first_bad, out[first_bad]);
    }

    // Deallocate device memory
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_out));

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return first_bad < 0 ? 0 : EXIT_FAILURE;
}
cd babel
nvcc gpu_vector_add_withoutpara.cu -o gpu_vector_add_withoutpara
./gpu_vector_add_withoutpara
out[0] = 3.000000 PASSED
10.2.2. dim 1, grid 1, block 256 N10000000
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define N 10000000
#define MAX_ERR 1e-6

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise out[i] = a[i] + b[i] for i in [0, n).
 *
 * Each thread starts at its global index and advances by the total number
 * of launched threads (gridDim.x * blockDim.x). With the <<<1, 256>>>
 * launch used below that is the same start (threadIdx.x) and stride
 * (blockDim.x) as before, but the kernel now also covers the range
 * correctly if it is launched with more than one block.
 */
__global__ void vector_add(float *out, const float *a, const float *b, int n) {
    const int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
        out[i] = a[i] + b[i];
    }
}

/*
 * Adds two N-element vectors on the device with one block of 256 threads
 * and verifies the result on the host.
 *
 * Host allocations, CUDA calls and the launch are now checked, and the
 * verification no longer relies on assert(), which is compiled out under
 * NDEBUG. Returns 0 on success and EXIT_FAILURE otherwise.
 */
int main() {
    const size_t bytes = sizeof(float) * N;

    // Allocate host memory
    float *a = (float *)malloc(bytes);
    float *b = (float *)malloc(bytes);
    float *out = (float *)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        free(out);
        return EXIT_FAILURE;
    }

    // Initialize host arrays
    for (int i = 0; i < N; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Allocate device memory
    float *d_a = NULL, *d_b = NULL, *d_out = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_out, bytes));

    // Transfer data from host to device memory
    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    // Executing kernel
    vector_add<<<1, 256>>>(d_out, d_a, d_b, N);
    CUDA_CHECK(cudaGetLastError());

    // Transfer data back to host memory (blocking, so it also reports
    // errors raised while the kernel was running)
    CUDA_CHECK(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost));

    // Verification; the negated comparison also rejects NaN
    int first_bad = -1;
    for (int i = 0; i < N; i++) {
        if (!(fabs(out[i] - a[i] - b[i]) < MAX_ERR)) {
            first_bad = i;
            break;
        }
    }

    if (first_bad < 0) {
        printf("PASSED\n");
    } else {
        fprintf(stderr, "FAILED: out[%d] = %f\n", first_bad, out[first_bad]);
    }

    // Deallocate device memory
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_out));

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return first_bad < 0 ? 0 : EXIT_FAILURE;
}
cd babel
nvcc gpu_vector_add_dim1_grid1_block256.cu -o gpu_vector_add_dim1_grid1_block256
./gpu_vector_add_dim1_grid1_block256
10.2.3. dim 1, grid 1, block 256 N10000000 with time
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define N 10000000
#define MAX_ERR 1e-6

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise out[i] = a[i] + b[i] for i in [0, n).
 *
 * Each thread starts at its global index and advances by the total number
 * of launched threads (gridDim.x * blockDim.x). With the <<<1, 256>>>
 * launch used below that is the same start (threadIdx.x) and stride
 * (blockDim.x) as before, but the kernel now also covers the range
 * correctly if it is launched with more than one block.
 */
__global__ void vector_add(float *out, const float *a, const float *b, int n) {
    const int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
        out[i] = a[i] + b[i];
    }
}

/*
 * Adds two N-element vectors on the device with one block of 256 threads,
 * reports the elapsed time measured with CUDA events, and verifies the
 * result on the host.
 *
 * The timed region spans both host-to-device copies, the kernel and the
 * device-to-host copy. Host allocations, CUDA calls and the launch are now
 * checked, and the verification no longer relies on assert(), which is
 * compiled out under NDEBUG. Returns 0 on success and EXIT_FAILURE
 * otherwise.
 */
int main() {
    const size_t bytes = sizeof(float) * N;

    // Allocate host memory
    float *a = (float *)malloc(bytes);
    float *b = (float *)malloc(bytes);
    float *out = (float *)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        free(out);
        return EXIT_FAILURE;
    }

    // Initialize host arrays
    for (int i = 0; i < N; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Allocate device memory
    float *d_a = NULL, *d_b = NULL, *d_out = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_out, bytes));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));

    // Transfer data from host to device memory
    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    // Executing kernel
    vector_add<<<1, 256>>>(d_out, d_a, d_b, N);
    CUDA_CHECK(cudaGetLastError());

    // Transfer data back to host memory (blocking, so it also reports
    // errors raised while the kernel was running)
    CUDA_CHECK(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float elapsedTime = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));

    // Verification; the negated comparison also rejects NaN
    int first_bad = -1;
    for (int i = 0; i < N; i++) {
        if (!(fabs(out[i] - a[i] - b[i]) < MAX_ERR)) {
            first_bad = i;
            break;
        }
    }

    if (first_bad < 0) {
        printf("PASSED with %f ms\n", elapsedTime);
    } else {
        fprintf(stderr, "FAILED: out[%d] = %f\n", first_bad, out[first_bad]);
    }

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // Deallocate device memory
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_out));

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return first_bad < 0 ? 0 : EXIT_FAILURE;
}
cd babel
nvcc gpu_vector_add_dim1_grid1_block256_withtime.cu -o gpu_vector_add_dim1_grid1_block256_withtime
./gpu_vector_add_dim1_grid1_block256_withtime
10.2.4. dim 1, grid N, block 1, (N=512)
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>

#define N 512

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * c[i] = a[i] + b[i], with the element index taken from blockIdx.x.
 * Intended for a launch of N blocks with one thread each; indices at or
 * beyond N are skipped.
 */
__global__ void add(int *a, int *b, int *c) {
    int tid = blockIdx.x;  // handle the data at this index
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

/*
 * Fills a[i] = -i and b[i] = i^3 on the host, adds them on the device
 * using N single-thread blocks, and prints every "a + b = c" line.
 * All CUDA calls and the launch are now checked for errors.
 */
int main() {
    int a[N], b[N], c[N], i;
    int *dev_a, *dev_b, *dev_c;
    const size_t bytes = N * sizeof(int);

    CUDA_CHECK(cudaMalloc((void **)&dev_c, bytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_a, bytes));

    for (i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i * i;
    }

    CUDA_CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    add<<<N, 1>>>(dev_a, dev_b, dev_c);
    /* A bad launch configuration is only reported through this call. */
    CUDA_CHECK(cudaGetLastError());

    /* The blocking copy also surfaces errors raised while the kernel ran. */
    CUDA_CHECK(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    for (i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    CUDA_CHECK(cudaFree(dev_c));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_a));
    return 0;
}
cd babel
nvcc dim1_gridN_block1.cu -o dim1_gridN_block1
./dim1_gridN_block1
0 + 0 = 0 -1 + 1 = 0 -2 + 8 = 6 -3 + 27 = 24 -4 + 64 = 60 -5 + 125 = 120 -6 + 216 = 210 -7 + 343 = 336 -8 + 512 = 504 -9 + 729 = 720 -10 + 1000 = 990 -11 + 1331 = 1320 -12 + 1728 = 1716 -13 + 2197 = 2184 -14 + 2744 = 2730 -15 + 3375 = 3360 -16 + 4096 = 4080 -17 + 4913 = 4896 -18 + 5832 = 5814 -19 + 6859 = 6840 -20 + 8000 = 7980 -21 + 9261 = 9240 -22 + 10648 = 10626 -23 + 12167 = 12144 -24 + 13824 = 13800 -25 + 15625 = 15600 -26 + 17576 = 17550 -27 + 19683 = 19656 -28 + 21952 = 21924 -29 + 24389 = 24360 -30 + 27000 = 26970 -31 + 29791 = 29760 -32 + 32768 = 32736 -33 + 35937 = 35904 -34 + 39304 = 39270 -35 + 42875 = 42840 -36 + 46656 = 46620 -37 + 50653 = 50616 -38 + 54872 = 54834 -39 + 59319 = 59280 -40 + 64000 = 63960 -41 + 68921 = 68880 -42 + 74088 = 74046 -43 + 79507 = 79464 -44 + 85184 = 85140 -45 + 91125 = 91080 -46 + 97336 = 97290 -47 + 103823 = 103776 -48 + 110592 = 110544 -49 + 117649 = 117600 -50 + 125000 = 124950 -51 + 132651 = 132600 -52 + 140608 = 140556 -53 + 148877 = 148824 -54 + 157464 = 157410 -55 + 166375 = 166320 -56 + 175616 = 175560 -57 + 185193 = 185136 -58 + 195112 = 195054 -59 + 205379 = 205320 -60 + 216000 = 215940 -61 + 226981 = 226920 -62 + 238328 = 238266 -63 + 250047 = 249984 -64 + 262144 = 262080 -65 + 274625 = 274560 -66 + 287496 = 287430 -67 + 300763 = 300696 -68 + 314432 = 314364 -69 + 328509 = 328440 -70 + 343000 = 342930 -71 + 357911 = 357840 -72 + 373248 = 373176 -73 + 389017 = 388944 -74 + 405224 = 405150 -75 + 421875 = 421800 -76 + 438976 = 438900 -77 + 456533 = 456456 -78 + 474552 = 474474 -79 + 493039 = 492960 -80 + 512000 = 511920 -81 + 531441 = 531360 -82 + 551368 = 551286 -83 + 571787 = 571704 -84 + 592704 = 592620 -85 + 614125 = 614040 -86 + 636056 = 635970 -87 + 658503 = 658416 -88 + 681472 = 681384 -89 + 704969 = 704880 -90 + 729000 = 728910 -91 + 753571 = 753480 -92 + 778688 = 778596 -93 + 804357 = 804264 -94 + 830584 = 830490 -95 + 857375 = 857280 -96 + 884736 = 884640 -97 + 912673 = 912576 -98 + 941192 = 941094 -99 + 
970299 = 970200 -100 + 1000000 = 999900 -101 + 1030301 = 1030200 -102 + 1061208 = 1061106 -103 + 1092727 = 1092624 -104 + 1124864 = 1124760 -105 + 1157625 = 1157520 -106 + 1191016 = 1190910 -107 + 1225043 = 1224936 -108 + 1259712 = 1259604 -109 + 1295029 = 1294920 -110 + 1331000 = 1330890 -111 + 1367631 = 1367520 -112 + 1404928 = 1404816 -113 + 1442897 = 1442784 -114 + 1481544 = 1481430 -115 + 1520875 = 1520760 -116 + 1560896 = 1560780 -117 + 1601613 = 1601496 -118 + 1643032 = 1642914 -119 + 1685159 = 1685040 -120 + 1728000 = 1727880 -121 + 1771561 = 1771440 -122 + 1815848 = 1815726 -123 + 1860867 = 1860744 -124 + 1906624 = 1906500 -125 + 1953125 = 1953000 -126 + 2000376 = 2000250 -127 + 2048383 = 2048256 -128 + 2097152 = 2097024 -129 + 2146689 = 2146560 -130 + 2197000 = 2196870 -131 + 2248091 = 2247960 -132 + 2299968 = 2299836 -133 + 2352637 = 2352504 -134 + 2406104 = 2405970 -135 + 2460375 = 2460240 -136 + 2515456 = 2515320 -137 + 2571353 = 2571216 -138 + 2628072 = 2627934 -139 + 2685619 = 2685480 -140 + 2744000 = 2743860 -141 + 2803221 = 2803080 -142 + 2863288 = 2863146 -143 + 2924207 = 2924064 -144 + 2985984 = 2985840 -145 + 3048625 = 3048480 -146 + 3112136 = 3111990 -147 + 3176523 = 3176376 -148 + 3241792 = 3241644 -149 + 3307949 = 3307800 -150 + 3375000 = 3374850 -151 + 3442951 = 3442800 -152 + 3511808 = 3511656 -153 + 3581577 = 3581424 -154 + 3652264 = 3652110 -155 + 3723875 = 3723720 -156 + 3796416 = 3796260 -157 + 3869893 = 3869736 -158 + 3944312 = 3944154 -159 + 4019679 = 4019520 -160 + 4096000 = 4095840 -161 + 4173281 = 4173120 -162 + 4251528 = 4251366 -163 + 4330747 = 4330584 -164 + 4410944 = 4410780 -165 + 4492125 = 4491960 -166 + 4574296 = 4574130 -167 + 4657463 = 4657296 -168 + 4741632 = 4741464 -169 + 4826809 = 4826640 -170 + 4913000 = 4912830 -171 + 5000211 = 5000040 -172 + 5088448 = 5088276 -173 + 5177717 = 5177544 -174 + 5268024 = 5267850 -175 + 5359375 = 5359200 -176 + 5451776 = 5451600 -177 + 5545233 = 5545056 -178 + 5639752 = 5639574 -179 + 
5735339 = 5735160 -180 + 5832000 = 5831820 -181 + 5929741 = 5929560 -182 + 6028568 = 6028386 -183 + 6128487 = 6128304 -184 + 6229504 = 6229320 -185 + 6331625 = 6331440 -186 + 6434856 = 6434670 -187 + 6539203 = 6539016 -188 + 6644672 = 6644484 -189 + 6751269 = 6751080 -190 + 6859000 = 6858810 -191 + 6967871 = 6967680 -192 + 7077888 = 7077696 -193 + 7189057 = 7188864 -194 + 7301384 = 7301190 -195 + 7414875 = 7414680 -196 + 7529536 = 7529340 -197 + 7645373 = 7645176 -198 + 7762392 = 7762194 -199 + 7880599 = 7880400 -200 + 8000000 = 7999800 -201 + 8120601 = 8120400 -202 + 8242408 = 8242206 -203 + 8365427 = 8365224 -204 + 8489664 = 8489460 -205 + 8615125 = 8614920 -206 + 8741816 = 8741610 -207 + 8869743 = 8869536 -208 + 8998912 = 8998704 -209 + 9129329 = 9129120 -210 + 9261000 = 9260790 -211 + 9393931 = 9393720 -212 + 9528128 = 9527916 -213 + 9663597 = 9663384 -214 + 9800344 = 9800130 -215 + 9938375 = 9938160 -216 + 10077696 = 10077480 -217 + 10218313 = 10218096 -218 + 10360232 = 10360014 -219 + 10503459 = 10503240 -220 + 10648000 = 10647780 -221 + 10793861 = 10793640 -222 + 10941048 = 10940826 -223 + 11089567 = 11089344 -224 + 11239424 = 11239200 -225 + 11390625 = 11390400 -226 + 11543176 = 11542950 -227 + 11697083 = 11696856 -228 + 11852352 = 11852124 -229 + 12008989 = 12008760 -230 + 12167000 = 12166770 -231 + 12326391 = 12326160 -232 + 12487168 = 12486936 -233 + 12649337 = 12649104 -234 + 12812904 = 12812670 -235 + 12977875 = 12977640 -236 + 13144256 = 13144020 -237 + 13312053 = 13311816 -238 + 13481272 = 13481034 -239 + 13651919 = 13651680 -240 + 13824000 = 13823760 -241 + 13997521 = 13997280 -242 + 14172488 = 14172246 -243 + 14348907 = 14348664 -244 + 14526784 = 14526540 -245 + 14706125 = 14705880 -246 + 14886936 = 14886690 -247 + 15069223 = 15068976 -248 + 15252992 = 15252744 -249 + 15438249 = 15438000 -250 + 15625000 = 15624750 -251 + 15813251 = 15813000 -252 + 16003008 = 16002756 -253 + 16194277 = 16194024 -254 + 16387064 = 16386810 -255 + 16581375 = 16581120 
-256 + 16777216 = 16776960 -257 + 16974593 = 16974336 -258 + 17173512 = 17173254 -259 + 17373979 = 17373720 -260 + 17576000 = 17575740 -261 + 17779581 = 17779320 -262 + 17984728 = 17984466 -263 + 18191447 = 18191184 -264 + 18399744 = 18399480 -265 + 18609625 = 18609360 -266 + 18821096 = 18820830 -267 + 19034163 = 19033896 -268 + 19248832 = 19248564 -269 + 19465109 = 19464840 -270 + 19683000 = 19682730 -271 + 19902511 = 19902240 -272 + 20123648 = 20123376 -273 + 20346417 = 20346144 -274 + 20570824 = 20570550 -275 + 20796875 = 20796600 -276 + 21024576 = 21024300 -277 + 21253933 = 21253656 -278 + 21484952 = 21484674 -279 + 21717639 = 21717360 -280 + 21952000 = 21951720 -281 + 22188041 = 22187760 -282 + 22425768 = 22425486 -283 + 22665187 = 22664904 -284 + 22906304 = 22906020 -285 + 23149125 = 23148840 -286 + 23393656 = 23393370 -287 + 23639903 = 23639616 -288 + 23887872 = 23887584 -289 + 24137569 = 24137280 -290 + 24389000 = 24388710 -291 + 24642171 = 24641880 -292 + 24897088 = 24896796 -293 + 25153757 = 25153464 -294 + 25412184 = 25411890 -295 + 25672375 = 25672080 -296 + 25934336 = 25934040 -297 + 26198073 = 26197776 -298 + 26463592 = 26463294 -299 + 26730899 = 26730600 -300 + 27000000 = 26999700 -301 + 27270901 = 27270600 -302 + 27543608 = 27543306 -303 + 27818127 = 27817824 -304 + 28094464 = 28094160 -305 + 28372625 = 28372320 -306 + 28652616 = 28652310 -307 + 28934443 = 28934136 -308 + 29218112 = 29217804 -309 + 29503629 = 29503320 -310 + 29791000 = 29790690 -311 + 30080231 = 30079920 -312 + 30371328 = 30371016 -313 + 30664297 = 30663984 -314 + 30959144 = 30958830 -315 + 31255875 = 31255560 -316 + 31554496 = 31554180 -317 + 31855013 = 31854696 -318 + 32157432 = 32157114 -319 + 32461759 = 32461440 -320 + 32768000 = 32767680 -321 + 33076161 = 33075840 -322 + 33386248 = 33385926 -323 + 33698267 = 33697944 -324 + 34012224 = 34011900 -325 + 34328125 = 34327800 -326 + 34645976 = 34645650 -327 + 34965783 = 34965456 -328 + 35287552 = 35287224 -329 + 35611289 = 35610960 
-330 + 35937000 = 35936670 -331 + 36264691 = 36264360 -332 + 36594368 = 36594036 -333 + 36926037 = 36925704 -334 + 37259704 = 37259370 -335 + 37595375 = 37595040 -336 + 37933056 = 37932720 -337 + 38272753 = 38272416 -338 + 38614472 = 38614134 -339 + 38958219 = 38957880 -340 + 39304000 = 39303660 -341 + 39651821 = 39651480 -342 + 40001688 = 40001346 -343 + 40353607 = 40353264 -344 + 40707584 = 40707240 -345 + 41063625 = 41063280 -346 + 41421736 = 41421390 -347 + 41781923 = 41781576 -348 + 42144192 = 42143844 -349 + 42508549 = 42508200 -350 + 42875000 = 42874650 -351 + 43243551 = 43243200 -352 + 43614208 = 43613856 -353 + 43986977 = 43986624 -354 + 44361864 = 44361510 -355 + 44738875 = 44738520 -356 + 45118016 = 45117660 -357 + 45499293 = 45498936 -358 + 45882712 = 45882354 -359 + 46268279 = 46267920 -360 + 46656000 = 46655640 -361 + 47045881 = 47045520 -362 + 47437928 = 47437566 -363 + 47832147 = 47831784 -364 + 48228544 = 48228180 -365 + 48627125 = 48626760 -366 + 49027896 = 49027530 -367 + 49430863 = 49430496 -368 + 49836032 = 49835664 -369 + 50243409 = 50243040 -370 + 50653000 = 50652630 -371 + 51064811 = 51064440 -372 + 51478848 = 51478476 -373 + 51895117 = 51894744 -374 + 52313624 = 52313250 -375 + 52734375 = 52734000 -376 + 53157376 = 53157000 -377 + 53582633 = 53582256 -378 + 54010152 = 54009774 -379 + 54439939 = 54439560 -380 + 54872000 = 54871620 -381 + 55306341 = 55305960 -382 + 55742968 = 55742586 -383 + 56181887 = 56181504 -384 + 56623104 = 56622720 -385 + 57066625 = 57066240 -386 + 57512456 = 57512070 -387 + 57960603 = 57960216 -388 + 58411072 = 58410684 -389 + 58863869 = 58863480 -390 + 59319000 = 59318610 -391 + 59776471 = 59776080 -392 + 60236288 = 60235896 -393 + 60698457 = 60698064 -394 + 61162984 = 61162590 -395 + 61629875 = 61629480 -396 + 62099136 = 62098740 -397 + 62570773 = 62570376 -398 + 63044792 = 63044394 -399 + 63521199 = 63520800 -400 + 64000000 = 63999600 -401 + 64481201 = 64480800 -402 + 64964808 = 64964406 -403 + 65450827 = 65450424 
-404 + 65939264 = 65938860 -405 + 66430125 = 66429720 -406 + 66923416 = 66923010 -407 + 67419143 = 67418736 -408 + 67917312 = 67916904 -409 + 68417929 = 68417520 -410 + 68921000 = 68920590 -411 + 69426531 = 69426120 -412 + 69934528 = 69934116 -413 + 70444997 = 70444584 -414 + 70957944 = 70957530 -415 + 71473375 = 71472960 -416 + 71991296 = 71990880 -417 + 72511713 = 72511296 -418 + 73034632 = 73034214 -419 + 73560059 = 73559640 -420 + 74088000 = 74087580 -421 + 74618461 = 74618040 -422 + 75151448 = 75151026 -423 + 75686967 = 75686544 -424 + 76225024 = 76224600 -425 + 76765625 = 76765200 -426 + 77308776 = 77308350 -427 + 77854483 = 77854056 -428 + 78402752 = 78402324 -429 + 78953589 = 78953160 -430 + 79507000 = 79506570 -431 + 80062991 = 80062560 -432 + 80621568 = 80621136 -433 + 81182737 = 81182304 -434 + 81746504 = 81746070 -435 + 82312875 = 82312440 -436 + 82881856 = 82881420 -437 + 83453453 = 83453016 -438 + 84027672 = 84027234 -439 + 84604519 = 84604080 -440 + 85184000 = 85183560 -441 + 85766121 = 85765680 -442 + 86350888 = 86350446 -443 + 86938307 = 86937864 -444 + 87528384 = 87527940 -445 + 88121125 = 88120680 -446 + 88716536 = 88716090 -447 + 89314623 = 89314176 -448 + 89915392 = 89914944 -449 + 90518849 = 90518400 -450 + 91125000 = 91124550 -451 + 91733851 = 91733400 -452 + 92345408 = 92344956 -453 + 92959677 = 92959224 -454 + 93576664 = 93576210 -455 + 94196375 = 94195920 -456 + 94818816 = 94818360 -457 + 95443993 = 95443536 -458 + 96071912 = 96071454 -459 + 96702579 = 96702120 -460 + 97336000 = 97335540 -461 + 97972181 = 97971720 -462 + 98611128 = 98610666 -463 + 99252847 = 99252384 -464 + 99897344 = 99896880 -465 + 100544625 = 100544160 -466 + 101194696 = 101194230 -467 + 101847563 = 101847096 -468 + 102503232 = 102502764 -469 + 103161709 = 103161240 -470 + 103823000 = 103822530 -471 + 104487111 = 104486640 -472 + 105154048 = 105153576 -473 + 105823817 = 105823344 -474 + 106496424 = 106495950 -475 + 107171875 = 107171400 -476 + 107850176 = 107849700 -477 
+ 108531333 = 108530856 -478 + 109215352 = 109214874 -479 + 109902239 = 109901760 -480 + 110592000 = 110591520 -481 + 111284641 = 111284160 -482 + 111980168 = 111979686 -483 + 112678587 = 112678104 -484 + 113379904 = 113379420 -485 + 114084125 = 114083640 -486 + 114791256 = 114790770 -487 + 115501303 = 115500816 -488 + 116214272 = 116213784 -489 + 116930169 = 116929680 -490 + 117649000 = 117648510 -491 + 118370771 = 118370280 -492 + 119095488 = 119094996 -493 + 119823157 = 119822664 -494 + 120553784 = 120553290 -495 + 121287375 = 121286880 -496 + 122023936 = 122023440 -497 + 122763473 = 122762976 -498 + 123505992 = 123505494 -499 + 124251499 = 124251000 -500 + 125000000 = 124999500 -501 + 125751501 = 125751000 -502 + 126506008 = 126505506 -503 + 127263527 = 127263024 -504 + 128024064 = 128023560 -505 + 128787625 = 128787120 -506 + 129554216 = 129553710 -507 + 130323843 = 130323336 -508 + 131096512 = 131096004 -509 + 131872229 = 131871720 -510 + 132651000 = 132650490 -511 + 133432831 = 133432320
10.2.5. dim 1, grid 1, block N, (N=512)
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>

#define N 512

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Element-wise sum c[i] = a[i] + b[i] over N ints.
 *
 * Expected launch: a single block of N threads (<<<1, N>>>). Only
 * threadIdx.x is used as the element index, so extra blocks would just
 * recompute the same elements. a, b and c must each hold N ints in
 * device memory.
 */
__global__ void add(int *a, int *b, int *c)
{
    int tid = threadIdx.x;  /* handle the data at this index */
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

/*
 * Fills a[i] = -i and b[i] = i^3 on the host, adds them on the GPU and
 * prints one "a + b = c" line per element.
 */
int main()
{
    int a[N], b[N], c[N], i;
    int *dev_a, *dev_b, *dev_c;
    const size_t bytes = N * sizeof(int);

    CUDA_CHECK(cudaMalloc((void**)&dev_c, bytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_a, bytes));

    for (i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i * i;
    }

    CUDA_CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    add<<<1, N>>>(dev_a, dev_b, dev_c);
    /* A kernel launch returns nothing; bad launch configs show up here. */
    CUDA_CHECK(cudaGetLastError());

    /* The blocking copy waits for the kernel and reports execution errors. */
    CUDA_CHECK(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    for (i = 0; i < N; i++)
        printf("%d + %d = %d\n", a[i], b[i], c[i]);

    CUDA_CHECK(cudaFree(dev_c));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_a));
    return 0;
}
cd babel
nvcc dim1_gridN_block1_N256.cu -o dim1_gridN_block1_N256
./dim1_gridN_block1_N256
0 + 0 = 0 -1 + 1 = 0 -2 + 8 = 6 -3 + 27 = 24 -4 + 64 = 60 -5 + 125 = 120 -6 + 216 = 210 -7 + 343 = 336 -8 + 512 = 504 -9 + 729 = 720 -10 + 1000 = 990 -11 + 1331 = 1320 -12 + 1728 = 1716 -13 + 2197 = 2184 -14 + 2744 = 2730 -15 + 3375 = 3360 -16 + 4096 = 4080 -17 + 4913 = 4896 -18 + 5832 = 5814 -19 + 6859 = 6840 -20 + 8000 = 7980 -21 + 9261 = 9240 -22 + 10648 = 10626 -23 + 12167 = 12144 -24 + 13824 = 13800 -25 + 15625 = 15600 -26 + 17576 = 17550 -27 + 19683 = 19656 -28 + 21952 = 21924 -29 + 24389 = 24360 -30 + 27000 = 26970 -31 + 29791 = 29760 -32 + 32768 = 32736 -33 + 35937 = 35904 -34 + 39304 = 39270 -35 + 42875 = 42840 -36 + 46656 = 46620 -37 + 50653 = 50616 -38 + 54872 = 54834 -39 + 59319 = 59280 -40 + 64000 = 63960 -41 + 68921 = 68880 -42 + 74088 = 74046 -43 + 79507 = 79464 -44 + 85184 = 85140 -45 + 91125 = 91080 -46 + 97336 = 97290 -47 + 103823 = 103776 -48 + 110592 = 110544 -49 + 117649 = 117600 -50 + 125000 = 124950 -51 + 132651 = 132600 -52 + 140608 = 140556 -53 + 148877 = 148824 -54 + 157464 = 157410 -55 + 166375 = 166320 -56 + 175616 = 175560 -57 + 185193 = 185136 -58 + 195112 = 195054 -59 + 205379 = 205320 -60 + 216000 = 215940 -61 + 226981 = 226920 -62 + 238328 = 238266 -63 + 250047 = 249984 -64 + 262144 = 262080 -65 + 274625 = 274560 -66 + 287496 = 287430 -67 + 300763 = 300696 -68 + 314432 = 314364 -69 + 328509 = 328440 -70 + 343000 = 342930 -71 + 357911 = 357840 -72 + 373248 = 373176 -73 + 389017 = 388944 -74 + 405224 = 405150 -75 + 421875 = 421800 -76 + 438976 = 438900 -77 + 456533 = 456456 -78 + 474552 = 474474 -79 + 493039 = 492960 -80 + 512000 = 511920 -81 + 531441 = 531360 -82 + 551368 = 551286 -83 + 571787 = 571704 -84 + 592704 = 592620 -85 + 614125 = 614040 -86 + 636056 = 635970 -87 + 658503 = 658416 -88 + 681472 = 681384 -89 + 704969 = 704880 -90 + 729000 = 728910 -91 + 753571 = 753480 -92 + 778688 = 778596 -93 + 804357 = 804264 -94 + 830584 = 830490 -95 + 857375 = 857280 -96 + 884736 = 884640 -97 + 912673 = 912576 -98 + 941192 = 941094 -99 + 
970299 = 970200 -100 + 1000000 = 999900 -101 + 1030301 = 1030200 -102 + 1061208 = 1061106 -103 + 1092727 = 1092624 -104 + 1124864 = 1124760 -105 + 1157625 = 1157520 -106 + 1191016 = 1190910 -107 + 1225043 = 1224936 -108 + 1259712 = 1259604 -109 + 1295029 = 1294920 -110 + 1331000 = 1330890 -111 + 1367631 = 1367520 -112 + 1404928 = 1404816 -113 + 1442897 = 1442784 -114 + 1481544 = 1481430 -115 + 1520875 = 1520760 -116 + 1560896 = 1560780 -117 + 1601613 = 1601496 -118 + 1643032 = 1642914 -119 + 1685159 = 1685040 -120 + 1728000 = 1727880 -121 + 1771561 = 1771440 -122 + 1815848 = 1815726 -123 + 1860867 = 1860744 -124 + 1906624 = 1906500 -125 + 1953125 = 1953000 -126 + 2000376 = 2000250 -127 + 2048383 = 2048256 -128 + 2097152 = 2097024 -129 + 2146689 = 2146560 -130 + 2197000 = 2196870 -131 + 2248091 = 2247960 -132 + 2299968 = 2299836 -133 + 2352637 = 2352504 -134 + 2406104 = 2405970 -135 + 2460375 = 2460240 -136 + 2515456 = 2515320 -137 + 2571353 = 2571216 -138 + 2628072 = 2627934 -139 + 2685619 = 2685480 -140 + 2744000 = 2743860 -141 + 2803221 = 2803080 -142 + 2863288 = 2863146 -143 + 2924207 = 2924064 -144 + 2985984 = 2985840 -145 + 3048625 = 3048480 -146 + 3112136 = 3111990 -147 + 3176523 = 3176376 -148 + 3241792 = 3241644 -149 + 3307949 = 3307800 -150 + 3375000 = 3374850 -151 + 3442951 = 3442800 -152 + 3511808 = 3511656 -153 + 3581577 = 3581424 -154 + 3652264 = 3652110 -155 + 3723875 = 3723720 -156 + 3796416 = 3796260 -157 + 3869893 = 3869736 -158 + 3944312 = 3944154 -159 + 4019679 = 4019520 -160 + 4096000 = 4095840 -161 + 4173281 = 4173120 -162 + 4251528 = 4251366 -163 + 4330747 = 4330584 -164 + 4410944 = 4410780 -165 + 4492125 = 4491960 -166 + 4574296 = 4574130 -167 + 4657463 = 4657296 -168 + 4741632 = 4741464 -169 + 4826809 = 4826640 -170 + 4913000 = 4912830 -171 + 5000211 = 5000040 -172 + 5088448 = 5088276 -173 + 5177717 = 5177544 -174 + 5268024 = 5267850 -175 + 5359375 = 5359200 -176 + 5451776 = 5451600 -177 + 5545233 = 5545056 -178 + 5639752 = 5639574 -179 + 
5735339 = 5735160 -180 + 5832000 = 5831820 -181 + 5929741 = 5929560 -182 + 6028568 = 6028386 -183 + 6128487 = 6128304 -184 + 6229504 = 6229320 -185 + 6331625 = 6331440 -186 + 6434856 = 6434670 -187 + 6539203 = 6539016 -188 + 6644672 = 6644484 -189 + 6751269 = 6751080 -190 + 6859000 = 6858810 -191 + 6967871 = 6967680 -192 + 7077888 = 7077696 -193 + 7189057 = 7188864 -194 + 7301384 = 7301190 -195 + 7414875 = 7414680 -196 + 7529536 = 7529340 -197 + 7645373 = 7645176 -198 + 7762392 = 7762194 -199 + 7880599 = 7880400 -200 + 8000000 = 7999800 -201 + 8120601 = 8120400 -202 + 8242408 = 8242206 -203 + 8365427 = 8365224 -204 + 8489664 = 8489460 -205 + 8615125 = 8614920 -206 + 8741816 = 8741610 -207 + 8869743 = 8869536 -208 + 8998912 = 8998704 -209 + 9129329 = 9129120 -210 + 9261000 = 9260790 -211 + 9393931 = 9393720 -212 + 9528128 = 9527916 -213 + 9663597 = 9663384 -214 + 9800344 = 9800130 -215 + 9938375 = 9938160 -216 + 10077696 = 10077480 -217 + 10218313 = 10218096 -218 + 10360232 = 10360014 -219 + 10503459 = 10503240 -220 + 10648000 = 10647780 -221 + 10793861 = 10793640 -222 + 10941048 = 10940826 -223 + 11089567 = 11089344 -224 + 11239424 = 11239200 -225 + 11390625 = 11390400 -226 + 11543176 = 11542950 -227 + 11697083 = 11696856 -228 + 11852352 = 11852124 -229 + 12008989 = 12008760 -230 + 12167000 = 12166770 -231 + 12326391 = 12326160 -232 + 12487168 = 12486936 -233 + 12649337 = 12649104 -234 + 12812904 = 12812670 -235 + 12977875 = 12977640 -236 + 13144256 = 13144020 -237 + 13312053 = 13311816 -238 + 13481272 = 13481034 -239 + 13651919 = 13651680 -240 + 13824000 = 13823760 -241 + 13997521 = 13997280 -242 + 14172488 = 14172246 -243 + 14348907 = 14348664 -244 + 14526784 = 14526540 -245 + 14706125 = 14705880 -246 + 14886936 = 14886690 -247 + 15069223 = 15068976 -248 + 15252992 = 15252744 -249 + 15438249 = 15438000 -250 + 15625000 = 15624750 -251 + 15813251 = 15813000 -252 + 16003008 = 16002756 -253 + 16194277 = 16194024 -254 + 16387064 = 16386810 -255 + 16581375 = 16581120 
-256 + 16777216 = 16776960 -257 + 16974593 = 16974336 -258 + 17173512 = 17173254 -259 + 17373979 = 17373720 -260 + 17576000 = 17575740 -261 + 17779581 = 17779320 -262 + 17984728 = 17984466 -263 + 18191447 = 18191184 -264 + 18399744 = 18399480 -265 + 18609625 = 18609360 -266 + 18821096 = 18820830 -267 + 19034163 = 19033896 -268 + 19248832 = 19248564 -269 + 19465109 = 19464840 -270 + 19683000 = 19682730 -271 + 19902511 = 19902240 -272 + 20123648 = 20123376 -273 + 20346417 = 20346144 -274 + 20570824 = 20570550 -275 + 20796875 = 20796600 -276 + 21024576 = 21024300 -277 + 21253933 = 21253656 -278 + 21484952 = 21484674 -279 + 21717639 = 21717360 -280 + 21952000 = 21951720 -281 + 22188041 = 22187760 -282 + 22425768 = 22425486 -283 + 22665187 = 22664904 -284 + 22906304 = 22906020 -285 + 23149125 = 23148840 -286 + 23393656 = 23393370 -287 + 23639903 = 23639616 -288 + 23887872 = 23887584 -289 + 24137569 = 24137280 -290 + 24389000 = 24388710 -291 + 24642171 = 24641880 -292 + 24897088 = 24896796 -293 + 25153757 = 25153464 -294 + 25412184 = 25411890 -295 + 25672375 = 25672080 -296 + 25934336 = 25934040 -297 + 26198073 = 26197776 -298 + 26463592 = 26463294 -299 + 26730899 = 26730600 -300 + 27000000 = 26999700 -301 + 27270901 = 27270600 -302 + 27543608 = 27543306 -303 + 27818127 = 27817824 -304 + 28094464 = 28094160 -305 + 28372625 = 28372320 -306 + 28652616 = 28652310 -307 + 28934443 = 28934136 -308 + 29218112 = 29217804 -309 + 29503629 = 29503320 -310 + 29791000 = 29790690 -311 + 30080231 = 30079920 -312 + 30371328 = 30371016 -313 + 30664297 = 30663984 -314 + 30959144 = 30958830 -315 + 31255875 = 31255560 -316 + 31554496 = 31554180 -317 + 31855013 = 31854696 -318 + 32157432 = 32157114 -319 + 32461759 = 32461440 -320 + 32768000 = 32767680 -321 + 33076161 = 33075840 -322 + 33386248 = 33385926 -323 + 33698267 = 33697944 -324 + 34012224 = 34011900 -325 + 34328125 = 34327800 -326 + 34645976 = 34645650 -327 + 34965783 = 34965456 -328 + 35287552 = 35287224 -329 + 35611289 = 35610960 
-330 + 35937000 = 35936670 -331 + 36264691 = 36264360 -332 + 36594368 = 36594036 -333 + 36926037 = 36925704 -334 + 37259704 = 37259370 -335 + 37595375 = 37595040 -336 + 37933056 = 37932720 -337 + 38272753 = 38272416 -338 + 38614472 = 38614134 -339 + 38958219 = 38957880 -340 + 39304000 = 39303660 -341 + 39651821 = 39651480 -342 + 40001688 = 40001346 -343 + 40353607 = 40353264 -344 + 40707584 = 40707240 -345 + 41063625 = 41063280 -346 + 41421736 = 41421390 -347 + 41781923 = 41781576 -348 + 42144192 = 42143844 -349 + 42508549 = 42508200 -350 + 42875000 = 42874650 -351 + 43243551 = 43243200 -352 + 43614208 = 43613856 -353 + 43986977 = 43986624 -354 + 44361864 = 44361510 -355 + 44738875 = 44738520 -356 + 45118016 = 45117660 -357 + 45499293 = 45498936 -358 + 45882712 = 45882354 -359 + 46268279 = 46267920 -360 + 46656000 = 46655640 -361 + 47045881 = 47045520 -362 + 47437928 = 47437566 -363 + 47832147 = 47831784 -364 + 48228544 = 48228180 -365 + 48627125 = 48626760 -366 + 49027896 = 49027530 -367 + 49430863 = 49430496 -368 + 49836032 = 49835664 -369 + 50243409 = 50243040 -370 + 50653000 = 50652630 -371 + 51064811 = 51064440 -372 + 51478848 = 51478476 -373 + 51895117 = 51894744 -374 + 52313624 = 52313250 -375 + 52734375 = 52734000 -376 + 53157376 = 53157000 -377 + 53582633 = 53582256 -378 + 54010152 = 54009774 -379 + 54439939 = 54439560 -380 + 54872000 = 54871620 -381 + 55306341 = 55305960 -382 + 55742968 = 55742586 -383 + 56181887 = 56181504 -384 + 56623104 = 56622720 -385 + 57066625 = 57066240 -386 + 57512456 = 57512070 -387 + 57960603 = 57960216 -388 + 58411072 = 58410684 -389 + 58863869 = 58863480 -390 + 59319000 = 59318610 -391 + 59776471 = 59776080 -392 + 60236288 = 60235896 -393 + 60698457 = 60698064 -394 + 61162984 = 61162590 -395 + 61629875 = 61629480 -396 + 62099136 = 62098740 -397 + 62570773 = 62570376 -398 + 63044792 = 63044394 -399 + 63521199 = 63520800 -400 + 64000000 = 63999600 -401 + 64481201 = 64480800 -402 + 64964808 = 64964406 -403 + 65450827 = 65450424 
-404 + 65939264 = 65938860 -405 + 66430125 = 66429720 -406 + 66923416 = 66923010 -407 + 67419143 = 67418736 -408 + 67917312 = 67916904 -409 + 68417929 = 68417520 -410 + 68921000 = 68920590 -411 + 69426531 = 69426120 -412 + 69934528 = 69934116 -413 + 70444997 = 70444584 -414 + 70957944 = 70957530 -415 + 71473375 = 71472960 -416 + 71991296 = 71990880 -417 + 72511713 = 72511296 -418 + 73034632 = 73034214 -419 + 73560059 = 73559640 -420 + 74088000 = 74087580 -421 + 74618461 = 74618040 -422 + 75151448 = 75151026 -423 + 75686967 = 75686544 -424 + 76225024 = 76224600 -425 + 76765625 = 76765200 -426 + 77308776 = 77308350 -427 + 77854483 = 77854056 -428 + 78402752 = 78402324 -429 + 78953589 = 78953160 -430 + 79507000 = 79506570 -431 + 80062991 = 80062560 -432 + 80621568 = 80621136 -433 + 81182737 = 81182304 -434 + 81746504 = 81746070 -435 + 82312875 = 82312440 -436 + 82881856 = 82881420 -437 + 83453453 = 83453016 -438 + 84027672 = 84027234 -439 + 84604519 = 84604080 -440 + 85184000 = 85183560 -441 + 85766121 = 85765680 -442 + 86350888 = 86350446 -443 + 86938307 = 86937864 -444 + 87528384 = 87527940 -445 + 88121125 = 88120680 -446 + 88716536 = 88716090 -447 + 89314623 = 89314176 -448 + 89915392 = 89914944 -449 + 90518849 = 90518400 -450 + 91125000 = 91124550 -451 + 91733851 = 91733400 -452 + 92345408 = 92344956 -453 + 92959677 = 92959224 -454 + 93576664 = 93576210 -455 + 94196375 = 94195920 -456 + 94818816 = 94818360 -457 + 95443993 = 95443536 -458 + 96071912 = 96071454 -459 + 96702579 = 96702120 -460 + 97336000 = 97335540 -461 + 97972181 = 97971720 -462 + 98611128 = 98610666 -463 + 99252847 = 99252384 -464 + 99897344 = 99896880 -465 + 100544625 = 100544160 -466 + 101194696 = 101194230 -467 + 101847563 = 101847096 -468 + 102503232 = 102502764 -469 + 103161709 = 103161240 -470 + 103823000 = 103822530 -471 + 104487111 = 104486640 -472 + 105154048 = 105153576 -473 + 105823817 = 105823344 -474 + 106496424 = 106495950 -475 + 107171875 = 107171400 -476 + 107850176 = 107849700 -477 
+ 108531333 = 108530856 -478 + 109215352 = 109214874 -479 + 109902239 = 109901760 -480 + 110592000 = 110591520 -481 + 111284641 = 111284160 -482 + 111980168 = 111979686 -483 + 112678587 = 112678104 -484 + 113379904 = 113379420 -485 + 114084125 = 114083640 -486 + 114791256 = 114790770 -487 + 115501303 = 115500816 -488 + 116214272 = 116213784 -489 + 116930169 = 116929680 -490 + 117649000 = 117648510 -491 + 118370771 = 118370280 -492 + 119095488 = 119094996 -493 + 119823157 = 119822664 -494 + 120553784 = 120553290 -495 + 121287375 = 121286880 -496 + 122023936 = 122023440 -497 + 122763473 = 122762976 -498 + 123505992 = 123505494 -499 + 124251499 = 124251000 -500 + 125000000 = 124999500 -501 + 125751501 = 125751000 -502 + 126506008 = 126505506 -503 + 127263527 = 127263024 -504 + 128024064 = 128023560 -505 + 128787625 = 128787120 -506 + 129554216 = 129553710 -507 + 130323843 = 130323336 -508 + 131096512 = 131096004 -509 + 131872229 = 131871720 -510 + 132651000 = 132650490 -511 + 133432831 = 133432320
10.2.6. dim 1, grid x, block 256
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define N 1000000
#define MAX_ERR 1e-6

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Element-wise sum out[i] = a[i] + b[i] for i in [0, n).
 *
 * Expected launch: 1-D grid of 1-D blocks, any size. Each thread starts at
 * its global index and advances by the total number of threads in the grid
 * (grid-stride loop), so every element is handled exactly once regardless
 * of the launch configuration. out, a and b must each hold at least n
 * floats in device memory.
 */
__global__ void vector_add(float *out, float *a, float *b, int n)
{
    /* blockIdx.x, not .y: the grid is 1-D, so blockIdx.y is always 0 and
       would make every block redo the whole vector. */
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;

    /* Handling arbitrary vector size */
    for (int i = tid; i < n; i += stride) {
        out[i] = a[i] + b[i];
    }
}

/*
 * Adds two N-element vectors (all 1.0f and all 2.0f) on the GPU, verifies
 * the result on the host and prints "PASSED" on success.
 */
int main()
{
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;
    const size_t bytes = sizeof(float) * N;

    /* Allocate host memory */
    a = (float*)malloc(bytes);
    b = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        free(out);
        return EXIT_FAILURE;
    }

    /* Initialize host arrays */
    for (int i = 0; i < N; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    /* Allocate device memory */
    CUDA_CHECK(cudaMalloc((void**)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out, bytes));

    /* Transfer data from host to device memory */
    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    /* Executing kernel: ceil-div so the last partial block is covered
       without launching a spare block when N divides evenly. */
    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    vector_add<<<grid_size, block_size>>>(d_out, d_a, d_b, N);
    /* A kernel launch returns nothing; bad launch configs show up here. */
    CUDA_CHECK(cudaGetLastError());

    /* Transfer data back to host memory (blocking; also reports kernel
       execution errors). */
    CUDA_CHECK(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost));

    /* Verification: explicit check so it still runs when NDEBUG is set. */
    for (int i = 0; i < N; i++) {
        if (!(fabsf(out[i] - a[i] - b[i]) < MAX_ERR)) {
            fprintf(stderr, "mismatch at %d: %f + %f != %f\n",
                    i, a[i], b[i], out[i]);
            return EXIT_FAILURE;
        }
    }
    printf("PASSED\n");

    /* Deallocate device memory */
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_out));

    /* Deallocate host memory */
    free(a);
    free(b);
    free(out);
    return 0;
}
cd babel
nvcc gpu_vector_add_dim1_gridx_block256.cu -o gpu_vector_add_dim1_gridx_block256
./gpu_vector_add_dim1_gridx_block256
PASSED
10.3. Addition dim 2 array in GPU
10.3.1. dim 2, grid 1, block NxN for array
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>

#define N 512

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Element-wise sum c[i] = a[i] + b[i] over N ints.
 *
 * Expected launch: 1-D grid of 2-D blocks. The 2-D thread coordinates are
 * flattened row by row inside the block, then offset by the number of
 * threads in the preceding blocks along x. Threads whose flat index is
 * >= N do nothing. a, b and c must each hold N ints in device memory.
 */
__global__ void add(int *a, int *b, int *c)
{
    int threads_per_block = blockDim.x * blockDim.y;
    int tid = threadIdx.x + threadIdx.y * blockDim.x
            + threads_per_block * blockIdx.x;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

/*
 * Adds two N-element int vectors (all 1 and all 2) on the GPU using one
 * block of 128 x 4 threads, verifies the result and prints "Passed".
 */
int main()
{
    int a[N], b[N], c[N], i;
    int *dev_a, *dev_b, *dev_c;
    const size_t bytes = N * sizeof(int);

    CUDA_CHECK(cudaMalloc((void**)&dev_c, bytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_a, bytes));

    for (i = 0; i < N; i++) {
        a[i] = 1;
        b[i] = 2;
    }

    CUDA_CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    dim3 BlockPerGrid(1, 1, 1);
    dim3 ThreadsPerBlock(128, 4, 1);  /* 128 * 4 = 512 = N threads */
    add<<<BlockPerGrid, ThreadsPerBlock>>>(dev_a, dev_b, dev_c);
    /* A kernel launch returns nothing; bad launch configs show up here. */
    CUDA_CHECK(cudaGetLastError());

    /* The blocking copy waits for the kernel and reports execution errors. */
    CUDA_CHECK(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    /* Explicit check so verification still runs when NDEBUG is set. */
    for (i = 0; i < N; i++) {
        if (a[i] + b[i] != c[i]) {
            fprintf(stderr, "mismatch at %d: %d + %d != %d\n",
                    i, a[i], b[i], c[i]);
            return EXIT_FAILURE;
        }
    }
    printf("Passed\n");

    CUDA_CHECK(cudaFree(dev_c));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_a));
    return 0;
}
cd babel
nvcc gpu_vector_add_dim2_grid1_blockNxN.cu -o gpu_vector_add_dim2_grid1_blockNxN
./gpu_vector_add_dim2_grid1_blockNxN
Passed
10.3.2. dim 2, grid 1, block NxN for matrix
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>

// Matrix edge length and total element count. N is parenthesised so it
// expands safely inside larger expressions. Note that `n` is a macro: do not
// use it as an identifier anywhere in this file.
#define n 32
#define N (n * n)

// Evaluate a CUDA runtime call and terminate with file/line context if it
// does not return cudaSuccess.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise addition of two n x n float matrices stored row-major:
 * c[x + n*y] = a[x + n*y] + b[x + n*y].
 *
 * Launch layout: a single block of n x n threads, one thread per element.
 * Each thread stages its own operands in statically sized shared tiles
 * (3 * n * n floats per block), adds them, and writes the sum back. A thread
 * only ever touches the tile cell it wrote itself, so no barrier is needed.
 *
 * a, b, c must be device pointers to at least n*n floats.
 */
__global__ void add(float *a, float *b, float *c) {
    __shared__ float A[n][n];
    __shared__ float B[n][n];
    __shared__ float C[n][n];

    int tidx = threadIdx.x;
    int tidy = threadIdx.y;

    // The tiles are exactly n x n; ignore threads outside that extent so a
    // larger block cannot index past the shared arrays.
    if (tidx < n && tidy < n) {
        int idx = tidx + n * tidy;
        A[tidy][tidx] = a[idx];
        B[tidy][tidx] = b[idx];
        C[tidy][tidx] = A[tidy][tidx] + B[tidy][tidx];
        c[idx] = C[tidy][tidx];
    }
}

/*
 * Fills two n x n matrices, adds them on the GPU with one n x n block,
 * prints every element of the result, and verifies it against the host sum
 * before printing "Passed".
 */
int main() {
    float a[N], b[N], out[N];
    float *d_a, *d_b, *d_out;

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            a[i + j * n] = 1.0f + i;
            b[i + j * n] = 1.0f + i;
        }
    }

    // Allocate device memory
    CUDA_CHECK(cudaMalloc((void**)&d_a, sizeof(float) * N));
    CUDA_CHECK(cudaMalloc((void**)&d_b, sizeof(float) * N));
    CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(float) * N));

    // Transfer data from host to device memory
    CUDA_CHECK(cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice));

    dim3 BlockPerGrid(1, 1, 1);
    dim3 ThreadsPerBlock(n, n, 1);
    add <<< BlockPerGrid, ThreadsPerBlock >>>(d_a, d_b, d_out);
    // A launch does not return a status; fetch launch-configuration errors.
    CUDA_CHECK(cudaGetLastError());

    // Copy the float result back (element size is float, not int).
    CUDA_CHECK(cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++) {
        printf("a[%d] + b[%d] == out[%d] is %f + %f = %f\n", i,i,i, a[i], b[i], out[i]);
    }

    // Verify before reporting success; operands are small whole numbers, so
    // the float sum is exact and can be compared directly.
    for (int i = 0; i < N; i++) {
        assert(out[i] == a[i] + b[i]);
    }
    printf("Passed\n");

    CUDA_CHECK(cudaFree(d_out));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_a));
    return 0;
}
cd babel
nvcc gpu_vector_add_dim2_grid1_blockNxN_matrix.cu -o gpu_vector_add_dim2_grid1_blockNxN_matrix
./gpu_vector_add_dim2_grid1_blockNxN_matrix
a[0] + b[0] == out[0] is 1.000000 + 1.000000 = 2.000000 a[1] + b[1] == out[1] is 2.000000 + 2.000000 = 4.000000 a[2] + b[2] == out[2] is 3.000000 + 3.000000 = 6.000000 a[3] + b[3] == out[3] is 4.000000 + 4.000000 = 8.000000 a[4] + b[4] == out[4] is 5.000000 + 5.000000 = 10.000000 a[5] + b[5] == out[5] is 6.000000 + 6.000000 = 12.000000 a[6] + b[6] == out[6] is 7.000000 + 7.000000 = 14.000000 a[7] + b[7] == out[7] is 8.000000 + 8.000000 = 16.000000 a[8] + b[8] == out[8] is 9.000000 + 9.000000 = 18.000000 a[9] + b[9] == out[9] is 10.000000 + 10.000000 = 20.000000 a[10] + b[10] == out[10] is 11.000000 + 11.000000 = 22.000000 a[11] + b[11] == out[11] is 12.000000 + 12.000000 = 24.000000 a[12] + b[12] == out[12] is 13.000000 + 13.000000 = 26.000000 a[13] + b[13] == out[13] is 14.000000 + 14.000000 = 28.000000 a[14] + b[14] == out[14] is 15.000000 + 15.000000 = 30.000000 a[15] + b[15] == out[15] is 16.000000 + 16.000000 = 32.000000 a[16] + b[16] == out[16] is 17.000000 + 17.000000 = 34.000000 a[17] + b[17] == out[17] is 18.000000 + 18.000000 = 36.000000 a[18] + b[18] == out[18] is 19.000000 + 19.000000 = 38.000000 a[19] + b[19] == out[19] is 20.000000 + 20.000000 = 40.000000 a[20] + b[20] == out[20] is 21.000000 + 21.000000 = 42.000000 a[21] + b[21] == out[21] is 22.000000 + 22.000000 = 44.000000 a[22] + b[22] == out[22] is 23.000000 + 23.000000 = 46.000000 a[23] + b[23] == out[23] is 24.000000 + 24.000000 = 48.000000 a[24] + b[24] == out[24] is 25.000000 + 25.000000 = 50.000000 a[25] + b[25] == out[25] is 26.000000 + 26.000000 = 52.000000 a[26] + b[26] == out[26] is 27.000000 + 27.000000 = 54.000000 a[27] + b[27] == out[27] is 28.000000 + 28.000000 = 56.000000 a[28] + b[28] == out[28] is 29.000000 + 29.000000 = 58.000000 a[29] + b[29] == out[29] is 30.000000 + 30.000000 = 60.000000 a[30] + b[30] == out[30] is 31.000000 + 31.000000 = 62.000000 a[31] + b[31] == out[31] is 32.000000 + 32.000000 = 64.000000 a[32] + b[32] == out[32] is 1.000000 + 1.000000 = 2.000000 a[33] + 
b[33] == out[33] is 2.000000 + 2.000000 = 4.000000 a[34] + b[34] == out[34] is 3.000000 + 3.000000 = 6.000000 a[35] + b[35] == out[35] is 4.000000 + 4.000000 = 8.000000 a[36] + b[36] == out[36] is 5.000000 + 5.000000 = 10.000000 a[37] + b[37] == out[37] is 6.000000 + 6.000000 = 12.000000 a[38] + b[38] == out[38] is 7.000000 + 7.000000 = 14.000000 a[39] + b[39] == out[39] is 8.000000 + 8.000000 = 16.000000 a[40] + b[40] == out[40] is 9.000000 + 9.000000 = 18.000000 a[41] + b[41] == out[41] is 10.000000 + 10.000000 = 20.000000 a[42] + b[42] == out[42] is 11.000000 + 11.000000 = 22.000000 a[43] + b[43] == out[43] is 12.000000 + 12.000000 = 24.000000 a[44] + b[44] == out[44] is 13.000000 + 13.000000 = 26.000000 a[45] + b[45] == out[45] is 14.000000 + 14.000000 = 28.000000 a[46] + b[46] == out[46] is 15.000000 + 15.000000 = 30.000000 a[47] + b[47] == out[47] is 16.000000 + 16.000000 = 32.000000 a[48] + b[48] == out[48] is 17.000000 + 17.000000 = 34.000000 a[49] + b[49] == out[49] is 18.000000 + 18.000000 = 36.000000 a[50] + b[50] == out[50] is 19.000000 + 19.000000 = 38.000000 a[51] + b[51] == out[51] is 20.000000 + 20.000000 = 40.000000 a[52] + b[52] == out[52] is 21.000000 + 21.000000 = 42.000000 a[53] + b[53] == out[53] is 22.000000 + 22.000000 = 44.000000 a[54] + b[54] == out[54] is 23.000000 + 23.000000 = 46.000000 a[55] + b[55] == out[55] is 24.000000 + 24.000000 = 48.000000 a[56] + b[56] == out[56] is 25.000000 + 25.000000 = 50.000000 a[57] + b[57] == out[57] is 26.000000 + 26.000000 = 52.000000 a[58] + b[58] == out[58] is 27.000000 + 27.000000 = 54.000000 a[59] + b[59] == out[59] is 28.000000 + 28.000000 = 56.000000 a[60] + b[60] == out[60] is 29.000000 + 29.000000 = 58.000000 a[61] + b[61] == out[61] is 30.000000 + 30.000000 = 60.000000 a[62] + b[62] == out[62] is 31.000000 + 31.000000 = 62.000000 a[63] + b[63] == out[63] is 32.000000 + 32.000000 = 64.000000 a[64] + b[64] == out[64] is 1.000000 + 1.000000 = 2.000000 a[65] + b[65] == out[65] is 2.000000 + 
2.000000 = 4.000000 a[66] + b[66] == out[66] is 3.000000 + 3.000000 = 6.000000 a[67] + b[67] == out[67] is 4.000000 + 4.000000 = 8.000000 a[68] + b[68] == out[68] is 5.000000 + 5.000000 = 10.000000 a[69] + b[69] == out[69] is 6.000000 + 6.000000 = 12.000000 a[70] + b[70] == out[70] is 7.000000 + 7.000000 = 14.000000 a[71] + b[71] == out[71] is 8.000000 + 8.000000 = 16.000000 a[72] + b[72] == out[72] is 9.000000 + 9.000000 = 18.000000 a[73] + b[73] == out[73] is 10.000000 + 10.000000 = 20.000000 a[74] + b[74] == out[74] is 11.000000 + 11.000000 = 22.000000 a[75] + b[75] == out[75] is 12.000000 + 12.000000 = 24.000000 a[76] + b[76] == out[76] is 13.000000 + 13.000000 = 26.000000 a[77] + b[77] == out[77] is 14.000000 + 14.000000 = 28.000000 a[78] + b[78] == out[78] is 15.000000 + 15.000000 = 30.000000 a[79] + b[79] == out[79] is 16.000000 + 16.000000 = 32.000000 a[80] + b[80] == out[80] is 17.000000 + 17.000000 = 34.000000 a[81] + b[81] == out[81] is 18.000000 + 18.000000 = 36.000000 a[82] + b[82] == out[82] is 19.000000 + 19.000000 = 38.000000 a[83] + b[83] == out[83] is 20.000000 + 20.000000 = 40.000000 a[84] + b[84] == out[84] is 21.000000 + 21.000000 = 42.000000 a[85] + b[85] == out[85] is 22.000000 + 22.000000 = 44.000000 a[86] + b[86] == out[86] is 23.000000 + 23.000000 = 46.000000 a[87] + b[87] == out[87] is 24.000000 + 24.000000 = 48.000000 a[88] + b[88] == out[88] is 25.000000 + 25.000000 = 50.000000 a[89] + b[89] == out[89] is 26.000000 + 26.000000 = 52.000000 a[90] + b[90] == out[90] is 27.000000 + 27.000000 = 54.000000 a[91] + b[91] == out[91] is 28.000000 + 28.000000 = 56.000000 a[92] + b[92] == out[92] is 29.000000 + 29.000000 = 58.000000 a[93] + b[93] == out[93] is 30.000000 + 30.000000 = 60.000000 a[94] + b[94] == out[94] is 31.000000 + 31.000000 = 62.000000 a[95] + b[95] == out[95] is 32.000000 + 32.000000 = 64.000000 a[96] + b[96] == out[96] is 1.000000 + 1.000000 = 2.000000 a[97] + b[97] == out[97] is 2.000000 + 2.000000 = 4.000000 a[98] + b[98] == 
out[98] is 3.000000 + 3.000000 = 6.000000 a[99] + b[99] == out[99] is 4.000000 + 4.000000 = 8.000000 a[100] + b[100] == out[100] is 5.000000 + 5.000000 = 10.000000 a[101] + b[101] == out[101] is 6.000000 + 6.000000 = 12.000000 a[102] + b[102] == out[102] is 7.000000 + 7.000000 = 14.000000 a[103] + b[103] == out[103] is 8.000000 + 8.000000 = 16.000000 a[104] + b[104] == out[104] is 9.000000 + 9.000000 = 18.000000 a[105] + b[105] == out[105] is 10.000000 + 10.000000 = 20.000000 a[106] + b[106] == out[106] is 11.000000 + 11.000000 = 22.000000 a[107] + b[107] == out[107] is 12.000000 + 12.000000 = 24.000000 a[108] + b[108] == out[108] is 13.000000 + 13.000000 = 26.000000 a[109] + b[109] == out[109] is 14.000000 + 14.000000 = 28.000000 a[110] + b[110] == out[110] is 15.000000 + 15.000000 = 30.000000 a[111] + b[111] == out[111] is 16.000000 + 16.000000 = 32.000000 a[112] + b[112] == out[112] is 17.000000 + 17.000000 = 34.000000 a[113] + b[113] == out[113] is 18.000000 + 18.000000 = 36.000000 a[114] + b[114] == out[114] is 19.000000 + 19.000000 = 38.000000 a[115] + b[115] == out[115] is 20.000000 + 20.000000 = 40.000000 a[116] + b[116] == out[116] is 21.000000 + 21.000000 = 42.000000 a[117] + b[117] == out[117] is 22.000000 + 22.000000 = 44.000000 a[118] + b[118] == out[118] is 23.000000 + 23.000000 = 46.000000 a[119] + b[119] == out[119] is 24.000000 + 24.000000 = 48.000000 a[120] + b[120] == out[120] is 25.000000 + 25.000000 = 50.000000 a[121] + b[121] == out[121] is 26.000000 + 26.000000 = 52.000000 a[122] + b[122] == out[122] is 27.000000 + 27.000000 = 54.000000 a[123] + b[123] == out[123] is 28.000000 + 28.000000 = 56.000000 a[124] + b[124] == out[124] is 29.000000 + 29.000000 = 58.000000 a[125] + b[125] == out[125] is 30.000000 + 30.000000 = 60.000000 a[126] + b[126] == out[126] is 31.000000 + 31.000000 = 62.000000 a[127] + b[127] == out[127] is 32.000000 + 32.000000 = 64.000000 a[128] + b[128] == out[128] is 1.000000 + 1.000000 = 2.000000 a[129] + b[129] == 
out[129] is 2.000000 + 2.000000 = 4.000000 a[130] + b[130] == out[130] is 3.000000 + 3.000000 = 6.000000 a[131] + b[131] == out[131] is 4.000000 + 4.000000 = 8.000000 a[132] + b[132] == out[132] is 5.000000 + 5.000000 = 10.000000 a[133] + b[133] == out[133] is 6.000000 + 6.000000 = 12.000000 a[134] + b[134] == out[134] is 7.000000 + 7.000000 = 14.000000 a[135] + b[135] == out[135] is 8.000000 + 8.000000 = 16.000000 a[136] + b[136] == out[136] is 9.000000 + 9.000000 = 18.000000 a[137] + b[137] == out[137] is 10.000000 + 10.000000 = 20.000000 a[138] + b[138] == out[138] is 11.000000 + 11.000000 = 22.000000 a[139] + b[139] == out[139] is 12.000000 + 12.000000 = 24.000000 a[140] + b[140] == out[140] is 13.000000 + 13.000000 = 26.000000 a[141] + b[141] == out[141] is 14.000000 + 14.000000 = 28.000000 a[142] + b[142] == out[142] is 15.000000 + 15.000000 = 30.000000 a[143] + b[143] == out[143] is 16.000000 + 16.000000 = 32.000000 a[144] + b[144] == out[144] is 17.000000 + 17.000000 = 34.000000 a[145] + b[145] == out[145] is 18.000000 + 18.000000 = 36.000000 a[146] + b[146] == out[146] is 19.000000 + 19.000000 = 38.000000 a[147] + b[147] == out[147] is 20.000000 + 20.000000 = 40.000000 a[148] + b[148] == out[148] is 21.000000 + 21.000000 = 42.000000 a[149] + b[149] == out[149] is 22.000000 + 22.000000 = 44.000000 a[150] + b[150] == out[150] is 23.000000 + 23.000000 = 46.000000 a[151] + b[151] == out[151] is 24.000000 + 24.000000 = 48.000000 a[152] + b[152] == out[152] is 25.000000 + 25.000000 = 50.000000 a[153] + b[153] == out[153] is 26.000000 + 26.000000 = 52.000000 a[154] + b[154] == out[154] is 27.000000 + 27.000000 = 54.000000 a[155] + b[155] == out[155] is 28.000000 + 28.000000 = 56.000000 a[156] + b[156] == out[156] is 29.000000 + 29.000000 = 58.000000 a[157] + b[157] == out[157] is 30.000000 + 30.000000 = 60.000000 a[158] + b[158] == out[158] is 31.000000 + 31.000000 = 62.000000 a[159] + b[159] == out[159] is 32.000000 + 32.000000 = 64.000000 a[160] + b[160] == 
out[160] is 1.000000 + 1.000000 = 2.000000 a[161] + b[161] == out[161] is 2.000000 + 2.000000 = 4.000000 a[162] + b[162] == out[162] is 3.000000 + 3.000000 = 6.000000 a[163] + b[163] == out[163] is 4.000000 + 4.000000 = 8.000000 a[164] + b[164] == out[164] is 5.000000 + 5.000000 = 10.000000 a[165] + b[165] == out[165] is 6.000000 + 6.000000 = 12.000000 a[166] + b[166] == out[166] is 7.000000 + 7.000000 = 14.000000 a[167] + b[167] == out[167] is 8.000000 + 8.000000 = 16.000000 a[168] + b[168] == out[168] is 9.000000 + 9.000000 = 18.000000 a[169] + b[169] == out[169] is 10.000000 + 10.000000 = 20.000000 a[170] + b[170] == out[170] is 11.000000 + 11.000000 = 22.000000 a[171] + b[171] == out[171] is 12.000000 + 12.000000 = 24.000000 a[172] + b[172] == out[172] is 13.000000 + 13.000000 = 26.000000 a[173] + b[173] == out[173] is 14.000000 + 14.000000 = 28.000000 a[174] + b[174] == out[174] is 15.000000 + 15.000000 = 30.000000 a[175] + b[175] == out[175] is 16.000000 + 16.000000 = 32.000000 a[176] + b[176] == out[176] is 17.000000 + 17.000000 = 34.000000 a[177] + b[177] == out[177] is 18.000000 + 18.000000 = 36.000000 a[178] + b[178] == out[178] is 19.000000 + 19.000000 = 38.000000 a[179] + b[179] == out[179] is 20.000000 + 20.000000 = 40.000000 a[180] + b[180] == out[180] is 21.000000 + 21.000000 = 42.000000 a[181] + b[181] == out[181] is 22.000000 + 22.000000 = 44.000000 a[182] + b[182] == out[182] is 23.000000 + 23.000000 = 46.000000 a[183] + b[183] == out[183] is 24.000000 + 24.000000 = 48.000000 a[184] + b[184] == out[184] is 25.000000 + 25.000000 = 50.000000 a[185] + b[185] == out[185] is 26.000000 + 26.000000 = 52.000000 a[186] + b[186] == out[186] is 27.000000 + 27.000000 = 54.000000 a[187] + b[187] == out[187] is 28.000000 + 28.000000 = 56.000000 a[188] + b[188] == out[188] is 29.000000 + 29.000000 = 58.000000 a[189] + b[189] == out[189] is 30.000000 + 30.000000 = 60.000000 a[190] + b[190] == out[190] is 31.000000 + 31.000000 = 62.000000 a[191] + b[191] == 
out[191] is 32.000000 + 32.000000 = 64.000000 a[192] + b[192] == out[192] is 1.000000 + 1.000000 = 2.000000 a[193] + b[193] == out[193] is 2.000000 + 2.000000 = 4.000000 a[194] + b[194] == out[194] is 3.000000 + 3.000000 = 6.000000 a[195] + b[195] == out[195] is 4.000000 + 4.000000 = 8.000000 a[196] + b[196] == out[196] is 5.000000 + 5.000000 = 10.000000 a[197] + b[197] == out[197] is 6.000000 + 6.000000 = 12.000000 a[198] + b[198] == out[198] is 7.000000 + 7.000000 = 14.000000 a[199] + b[199] == out[199] is 8.000000 + 8.000000 = 16.000000 a[200] + b[200] == out[200] is 9.000000 + 9.000000 = 18.000000 a[201] + b[201] == out[201] is 10.000000 + 10.000000 = 20.000000 a[202] + b[202] == out[202] is 11.000000 + 11.000000 = 22.000000 a[203] + b[203] == out[203] is 12.000000 + 12.000000 = 24.000000 a[204] + b[204] == out[204] is 13.000000 + 13.000000 = 26.000000 a[205] + b[205] == out[205] is 14.000000 + 14.000000 = 28.000000 a[206] + b[206] == out[206] is 15.000000 + 15.000000 = 30.000000 a[207] + b[207] == out[207] is 16.000000 + 16.000000 = 32.000000 a[208] + b[208] == out[208] is 17.000000 + 17.000000 = 34.000000 a[209] + b[209] == out[209] is 18.000000 + 18.000000 = 36.000000 a[210] + b[210] == out[210] is 19.000000 + 19.000000 = 38.000000 a[211] + b[211] == out[211] is 20.000000 + 20.000000 = 40.000000 a[212] + b[212] == out[212] is 21.000000 + 21.000000 = 42.000000 a[213] + b[213] == out[213] is 22.000000 + 22.000000 = 44.000000 a[214] + b[214] == out[214] is 23.000000 + 23.000000 = 46.000000 a[215] + b[215] == out[215] is 24.000000 + 24.000000 = 48.000000 a[216] + b[216] == out[216] is 25.000000 + 25.000000 = 50.000000 a[217] + b[217] == out[217] is 26.000000 + 26.000000 = 52.000000 a[218] + b[218] == out[218] is 27.000000 + 27.000000 = 54.000000 a[219] + b[219] == out[219] is 28.000000 + 28.000000 = 56.000000 a[220] + b[220] == out[220] is 29.000000 + 29.000000 = 58.000000 a[221] + b[221] == out[221] is 30.000000 + 30.000000 = 60.000000 a[222] + b[222] == 
out[222] is 31.000000 + 31.000000 = 62.000000 a[223] + b[223] == out[223] is 32.000000 + 32.000000 = 64.000000 a[224] + b[224] == out[224] is 1.000000 + 1.000000 = 2.000000 a[225] + b[225] == out[225] is 2.000000 + 2.000000 = 4.000000 a[226] + b[226] == out[226] is 3.000000 + 3.000000 = 6.000000 a[227] + b[227] == out[227] is 4.000000 + 4.000000 = 8.000000 a[228] + b[228] == out[228] is 5.000000 + 5.000000 = 10.000000 a[229] + b[229] == out[229] is 6.000000 + 6.000000 = 12.000000 a[230] + b[230] == out[230] is 7.000000 + 7.000000 = 14.000000 a[231] + b[231] == out[231] is 8.000000 + 8.000000 = 16.000000 a[232] + b[232] == out[232] is 9.000000 + 9.000000 = 18.000000 a[233] + b[233] == out[233] is 10.000000 + 10.000000 = 20.000000 a[234] + b[234] == out[234] is 11.000000 + 11.000000 = 22.000000 a[235] + b[235] == out[235] is 12.000000 + 12.000000 = 24.000000 a[236] + b[236] == out[236] is 13.000000 + 13.000000 = 26.000000 a[237] + b[237] == out[237] is 14.000000 + 14.000000 = 28.000000 a[238] + b[238] == out[238] is 15.000000 + 15.000000 = 30.000000 a[239] + b[239] == out[239] is 16.000000 + 16.000000 = 32.000000 a[240] + b[240] == out[240] is 17.000000 + 17.000000 = 34.000000 a[241] + b[241] == out[241] is 18.000000 + 18.000000 = 36.000000 a[242] + b[242] == out[242] is 19.000000 + 19.000000 = 38.000000 a[243] + b[243] == out[243] is 20.000000 + 20.000000 = 40.000000 a[244] + b[244] == out[244] is 21.000000 + 21.000000 = 42.000000 a[245] + b[245] == out[245] is 22.000000 + 22.000000 = 44.000000 a[246] + b[246] == out[246] is 23.000000 + 23.000000 = 46.000000 a[247] + b[247] == out[247] is 24.000000 + 24.000000 = 48.000000 a[248] + b[248] == out[248] is 25.000000 + 25.000000 = 50.000000 a[249] + b[249] == out[249] is 26.000000 + 26.000000 = 52.000000 a[250] + b[250] == out[250] is 27.000000 + 27.000000 = 54.000000 a[251] + b[251] == out[251] is 28.000000 + 28.000000 = 56.000000 a[252] + b[252] == out[252] is 29.000000 + 29.000000 = 58.000000 a[253] + b[253] == 
out[253] is 30.000000 + 30.000000 = 60.000000 a[254] + b[254] == out[254] is 31.000000 + 31.000000 = 62.000000 a[255] + b[255] == out[255] is 32.000000 + 32.000000 = 64.000000 a[256] + b[256] == out[256] is 1.000000 + 1.000000 = 2.000000 a[257] + b[257] == out[257] is 2.000000 + 2.000000 = 4.000000 a[258] + b[258] == out[258] is 3.000000 + 3.000000 = 6.000000 a[259] + b[259] == out[259] is 4.000000 + 4.000000 = 8.000000 a[260] + b[260] == out[260] is 5.000000 + 5.000000 = 10.000000 a[261] + b[261] == out[261] is 6.000000 + 6.000000 = 12.000000 a[262] + b[262] == out[262] is 7.000000 + 7.000000 = 14.000000 a[263] + b[263] == out[263] is 8.000000 + 8.000000 = 16.000000 a[264] + b[264] == out[264] is 9.000000 + 9.000000 = 18.000000 a[265] + b[265] == out[265] is 10.000000 + 10.000000 = 20.000000 a[266] + b[266] == out[266] is 11.000000 + 11.000000 = 22.000000 a[267] + b[267] == out[267] is 12.000000 + 12.000000 = 24.000000 a[268] + b[268] == out[268] is 13.000000 + 13.000000 = 26.000000 a[269] + b[269] == out[269] is 14.000000 + 14.000000 = 28.000000 a[270] + b[270] == out[270] is 15.000000 + 15.000000 = 30.000000 a[271] + b[271] == out[271] is 16.000000 + 16.000000 = 32.000000 a[272] + b[272] == out[272] is 17.000000 + 17.000000 = 34.000000 a[273] + b[273] == out[273] is 18.000000 + 18.000000 = 36.000000 a[274] + b[274] == out[274] is 19.000000 + 19.000000 = 38.000000 a[275] + b[275] == out[275] is 20.000000 + 20.000000 = 40.000000 a[276] + b[276] == out[276] is 21.000000 + 21.000000 = 42.000000 a[277] + b[277] == out[277] is 22.000000 + 22.000000 = 44.000000 a[278] + b[278] == out[278] is 23.000000 + 23.000000 = 46.000000 a[279] + b[279] == out[279] is 24.000000 + 24.000000 = 48.000000 a[280] + b[280] == out[280] is 25.000000 + 25.000000 = 50.000000 a[281] + b[281] == out[281] is 26.000000 + 26.000000 = 52.000000 a[282] + b[282] == out[282] is 27.000000 + 27.000000 = 54.000000 a[283] + b[283] == out[283] is 28.000000 + 28.000000 = 56.000000 a[284] + b[284] == 
out[284] is 29.000000 + 29.000000 = 58.000000 a[285] + b[285] == out[285] is 30.000000 + 30.000000 = 60.000000 a[286] + b[286] == out[286] is 31.000000 + 31.000000 = 62.000000 a[287] + b[287] == out[287] is 32.000000 + 32.000000 = 64.000000 a[288] + b[288] == out[288] is 1.000000 + 1.000000 = 2.000000 a[289] + b[289] == out[289] is 2.000000 + 2.000000 = 4.000000 a[290] + b[290] == out[290] is 3.000000 + 3.000000 = 6.000000 a[291] + b[291] == out[291] is 4.000000 + 4.000000 = 8.000000 a[292] + b[292] == out[292] is 5.000000 + 5.000000 = 10.000000 a[293] + b[293] == out[293] is 6.000000 + 6.000000 = 12.000000 a[294] + b[294] == out[294] is 7.000000 + 7.000000 = 14.000000 a[295] + b[295] == out[295] is 8.000000 + 8.000000 = 16.000000 a[296] + b[296] == out[296] is 9.000000 + 9.000000 = 18.000000 a[297] + b[297] == out[297] is 10.000000 + 10.000000 = 20.000000 a[298] + b[298] == out[298] is 11.000000 + 11.000000 = 22.000000 a[299] + b[299] == out[299] is 12.000000 + 12.000000 = 24.000000 a[300] + b[300] == out[300] is 13.000000 + 13.000000 = 26.000000 a[301] + b[301] == out[301] is 14.000000 + 14.000000 = 28.000000 a[302] + b[302] == out[302] is 15.000000 + 15.000000 = 30.000000 a[303] + b[303] == out[303] is 16.000000 + 16.000000 = 32.000000 a[304] + b[304] == out[304] is 17.000000 + 17.000000 = 34.000000 a[305] + b[305] == out[305] is 18.000000 + 18.000000 = 36.000000 a[306] + b[306] == out[306] is 19.000000 + 19.000000 = 38.000000 a[307] + b[307] == out[307] is 20.000000 + 20.000000 = 40.000000 a[308] + b[308] == out[308] is 21.000000 + 21.000000 = 42.000000 a[309] + b[309] == out[309] is 22.000000 + 22.000000 = 44.000000 a[310] + b[310] == out[310] is 23.000000 + 23.000000 = 46.000000 a[311] + b[311] == out[311] is 24.000000 + 24.000000 = 48.000000 a[312] + b[312] == out[312] is 25.000000 + 25.000000 = 50.000000 a[313] + b[313] == out[313] is 26.000000 + 26.000000 = 52.000000 a[314] + b[314] == out[314] is 27.000000 + 27.000000 = 54.000000 a[315] + b[315] == 
out[315] is 28.000000 + 28.000000 = 56.000000 a[316] + b[316] == out[316] is 29.000000 + 29.000000 = 58.000000 a[317] + b[317] == out[317] is 30.000000 + 30.000000 = 60.000000 a[318] + b[318] == out[318] is 31.000000 + 31.000000 = 62.000000 a[319] + b[319] == out[319] is 32.000000 + 32.000000 = 64.000000 a[320] + b[320] == out[320] is 1.000000 + 1.000000 = 2.000000 a[321] + b[321] == out[321] is 2.000000 + 2.000000 = 4.000000 a[322] + b[322] == out[322] is 3.000000 + 3.000000 = 6.000000 a[323] + b[323] == out[323] is 4.000000 + 4.000000 = 8.000000 a[324] + b[324] == out[324] is 5.000000 + 5.000000 = 10.000000 a[325] + b[325] == out[325] is 6.000000 + 6.000000 = 12.000000 a[326] + b[326] == out[326] is 7.000000 + 7.000000 = 14.000000 a[327] + b[327] == out[327] is 8.000000 + 8.000000 = 16.000000 a[328] + b[328] == out[328] is 9.000000 + 9.000000 = 18.000000 a[329] + b[329] == out[329] is 10.000000 + 10.000000 = 20.000000 a[330] + b[330] == out[330] is 11.000000 + 11.000000 = 22.000000 a[331] + b[331] == out[331] is 12.000000 + 12.000000 = 24.000000 a[332] + b[332] == out[332] is 13.000000 + 13.000000 = 26.000000 a[333] + b[333] == out[333] is 14.000000 + 14.000000 = 28.000000 a[334] + b[334] == out[334] is 15.000000 + 15.000000 = 30.000000 a[335] + b[335] == out[335] is 16.000000 + 16.000000 = 32.000000 a[336] + b[336] == out[336] is 17.000000 + 17.000000 = 34.000000 a[337] + b[337] == out[337] is 18.000000 + 18.000000 = 36.000000 a[338] + b[338] == out[338] is 19.000000 + 19.000000 = 38.000000 a[339] + b[339] == out[339] is 20.000000 + 20.000000 = 40.000000 a[340] + b[340] == out[340] is 21.000000 + 21.000000 = 42.000000 a[341] + b[341] == out[341] is 22.000000 + 22.000000 = 44.000000 a[342] + b[342] == out[342] is 23.000000 + 23.000000 = 46.000000 a[343] + b[343] == out[343] is 24.000000 + 24.000000 = 48.000000 a[344] + b[344] == out[344] is 25.000000 + 25.000000 = 50.000000 a[345] + b[345] == out[345] is 26.000000 + 26.000000 = 52.000000 a[346] + b[346] == 
out[346] is 27.000000 + 27.000000 = 54.000000 a[347] + b[347] == out[347] is 28.000000 + 28.000000 = 56.000000 a[348] + b[348] == out[348] is 29.000000 + 29.000000 = 58.000000 a[349] + b[349] == out[349] is 30.000000 + 30.000000 = 60.000000 a[350] + b[350] == out[350] is 31.000000 + 31.000000 = 62.000000 a[351] + b[351] == out[351] is 32.000000 + 32.000000 = 64.000000 a[352] + b[352] == out[352] is 1.000000 + 1.000000 = 2.000000 a[353] + b[353] == out[353] is 2.000000 + 2.000000 = 4.000000 a[354] + b[354] == out[354] is 3.000000 + 3.000000 = 6.000000 a[355] + b[355] == out[355] is 4.000000 + 4.000000 = 8.000000 a[356] + b[356] == out[356] is 5.000000 + 5.000000 = 10.000000 a[357] + b[357] == out[357] is 6.000000 + 6.000000 = 12.000000 a[358] + b[358] == out[358] is 7.000000 + 7.000000 = 14.000000 a[359] + b[359] == out[359] is 8.000000 + 8.000000 = 16.000000 a[360] + b[360] == out[360] is 9.000000 + 9.000000 = 18.000000 a[361] + b[361] == out[361] is 10.000000 + 10.000000 = 20.000000 a[362] + b[362] == out[362] is 11.000000 + 11.000000 = 22.000000 a[363] + b[363] == out[363] is 12.000000 + 12.000000 = 24.000000 a[364] + b[364] == out[364] is 13.000000 + 13.000000 = 26.000000 a[365] + b[365] == out[365] is 14.000000 + 14.000000 = 28.000000 a[366] + b[366] == out[366] is 15.000000 + 15.000000 = 30.000000 a[367] + b[367] == out[367] is 16.000000 + 16.000000 = 32.000000 a[368] + b[368] == out[368] is 17.000000 + 17.000000 = 34.000000 a[369] + b[369] == out[369] is 18.000000 + 18.000000 = 36.000000 a[370] + b[370] == out[370] is 19.000000 + 19.000000 = 38.000000 a[371] + b[371] == out[371] is 20.000000 + 20.000000 = 40.000000 a[372] + b[372] == out[372] is 21.000000 + 21.000000 = 42.000000 a[373] + b[373] == out[373] is 22.000000 + 22.000000 = 44.000000 a[374] + b[374] == out[374] is 23.000000 + 23.000000 = 46.000000 a[375] + b[375] == out[375] is 24.000000 + 24.000000 = 48.000000 a[376] + b[376] == out[376] is 25.000000 + 25.000000 = 50.000000 a[377] + b[377] == 
out[377] is 26.000000 + 26.000000 = 52.000000 a[378] + b[378] == out[378] is 27.000000 + 27.000000 = 54.000000 a[379] + b[379] == out[379] is 28.000000 + 28.000000 = 56.000000 a[380] + b[380] == out[380] is 29.000000 + 29.000000 = 58.000000 a[381] + b[381] == out[381] is 30.000000 + 30.000000 = 60.000000 a[382] + b[382] == out[382] is 31.000000 + 31.000000 = 62.000000 a[383] + b[383] == out[383] is 32.000000 + 32.000000 = 64.000000 a[384] + b[384] == out[384] is 1.000000 + 1.000000 = 2.000000 a[385] + b[385] == out[385] is 2.000000 + 2.000000 = 4.000000 a[386] + b[386] == out[386] is 3.000000 + 3.000000 = 6.000000 a[387] + b[387] == out[387] is 4.000000 + 4.000000 = 8.000000 a[388] + b[388] == out[388] is 5.000000 + 5.000000 = 10.000000 a[389] + b[389] == out[389] is 6.000000 + 6.000000 = 12.000000 a[390] + b[390] == out[390] is 7.000000 + 7.000000 = 14.000000 a[391] + b[391] == out[391] is 8.000000 + 8.000000 = 16.000000 a[392] + b[392] == out[392] is 9.000000 + 9.000000 = 18.000000 a[393] + b[393] == out[393] is 10.000000 + 10.000000 = 20.000000 a[394] + b[394] == out[394] is 11.000000 + 11.000000 = 22.000000 a[395] + b[395] == out[395] is 12.000000 + 12.000000 = 24.000000 a[396] + b[396] == out[396] is 13.000000 + 13.000000 = 26.000000 a[397] + b[397] == out[397] is 14.000000 + 14.000000 = 28.000000 a[398] + b[398] == out[398] is 15.000000 + 15.000000 = 30.000000 a[399] + b[399] == out[399] is 16.000000 + 16.000000 = 32.000000 a[400] + b[400] == out[400] is 17.000000 + 17.000000 = 34.000000 a[401] + b[401] == out[401] is 18.000000 + 18.000000 = 36.000000 a[402] + b[402] == out[402] is 19.000000 + 19.000000 = 38.000000 a[403] + b[403] == out[403] is 20.000000 + 20.000000 = 40.000000 a[404] + b[404] == out[404] is 21.000000 + 21.000000 = 42.000000 a[405] + b[405] == out[405] is 22.000000 + 22.000000 = 44.000000 a[406] + b[406] == out[406] is 23.000000 + 23.000000 = 46.000000 a[407] + b[407] == out[407] is 24.000000 + 24.000000 = 48.000000 a[408] + b[408] == 
out[408] is 25.000000 + 25.000000 = 50.000000 a[409] + b[409] == out[409] is 26.000000 + 26.000000 = 52.000000 a[410] + b[410] == out[410] is 27.000000 + 27.000000 = 54.000000 a[411] + b[411] == out[411] is 28.000000 + 28.000000 = 56.000000 a[412] + b[412] == out[412] is 29.000000 + 29.000000 = 58.000000 a[413] + b[413] == out[413] is 30.000000 + 30.000000 = 60.000000 a[414] + b[414] == out[414] is 31.000000 + 31.000000 = 62.000000 a[415] + b[415] == out[415] is 32.000000 + 32.000000 = 64.000000 a[416] + b[416] == out[416] is 1.000000 + 1.000000 = 2.000000 a[417] + b[417] == out[417] is 2.000000 + 2.000000 = 4.000000 a[418] + b[418] == out[418] is 3.000000 + 3.000000 = 6.000000 a[419] + b[419] == out[419] is 4.000000 + 4.000000 = 8.000000 a[420] + b[420] == out[420] is 5.000000 + 5.000000 = 10.000000 a[421] + b[421] == out[421] is 6.000000 + 6.000000 = 12.000000 a[422] + b[422] == out[422] is 7.000000 + 7.000000 = 14.000000 a[423] + b[423] == out[423] is 8.000000 + 8.000000 = 16.000000 a[424] + b[424] == out[424] is 9.000000 + 9.000000 = 18.000000 a[425] + b[425] == out[425] is 10.000000 + 10.000000 = 20.000000 a[426] + b[426] == out[426] is 11.000000 + 11.000000 = 22.000000 a[427] + b[427] == out[427] is 12.000000 + 12.000000 = 24.000000 a[428] + b[428] == out[428] is 13.000000 + 13.000000 = 26.000000 a[429] + b[429] == out[429] is 14.000000 + 14.000000 = 28.000000 a[430] + b[430] == out[430] is 15.000000 + 15.000000 = 30.000000 a[431] + b[431] == out[431] is 16.000000 + 16.000000 = 32.000000 a[432] + b[432] == out[432] is 17.000000 + 17.000000 = 34.000000 a[433] + b[433] == out[433] is 18.000000 + 18.000000 = 36.000000 a[434] + b[434] == out[434] is 19.000000 + 19.000000 = 38.000000 a[435] + b[435] == out[435] is 20.000000 + 20.000000 = 40.000000 a[436] + b[436] == out[436] is 21.000000 + 21.000000 = 42.000000 a[437] + b[437] == out[437] is 22.000000 + 22.000000 = 44.000000 a[438] + b[438] == out[438] is 23.000000 + 23.000000 = 46.000000 a[439] + b[439] == 
out[439] is 24.000000 + 24.000000 = 48.000000 a[440] + b[440] == out[440] is 25.000000 + 25.000000 = 50.000000 a[441] + b[441] == out[441] is 26.000000 + 26.000000 = 52.000000 a[442] + b[442] == out[442] is 27.000000 + 27.000000 = 54.000000 a[443] + b[443] == out[443] is 28.000000 + 28.000000 = 56.000000 a[444] + b[444] == out[444] is 29.000000 + 29.000000 = 58.000000 a[445] + b[445] == out[445] is 30.000000 + 30.000000 = 60.000000 a[446] + b[446] == out[446] is 31.000000 + 31.000000 = 62.000000 a[447] + b[447] == out[447] is 32.000000 + 32.000000 = 64.000000 a[448] + b[448] == out[448] is 1.000000 + 1.000000 = 2.000000 a[449] + b[449] == out[449] is 2.000000 + 2.000000 = 4.000000 a[450] + b[450] == out[450] is 3.000000 + 3.000000 = 6.000000 a[451] + b[451] == out[451] is 4.000000 + 4.000000 = 8.000000 a[452] + b[452] == out[452] is 5.000000 + 5.000000 = 10.000000 a[453] + b[453] == out[453] is 6.000000 + 6.000000 = 12.000000 a[454] + b[454] == out[454] is 7.000000 + 7.000000 = 14.000000 a[455] + b[455] == out[455] is 8.000000 + 8.000000 = 16.000000 a[456] + b[456] == out[456] is 9.000000 + 9.000000 = 18.000000 a[457] + b[457] == out[457] is 10.000000 + 10.000000 = 20.000000 a[458] + b[458] == out[458] is 11.000000 + 11.000000 = 22.000000 a[459] + b[459] == out[459] is 12.000000 + 12.000000 = 24.000000 a[460] + b[460] == out[460] is 13.000000 + 13.000000 = 26.000000 a[461] + b[461] == out[461] is 14.000000 + 14.000000 = 28.000000 a[462] + b[462] == out[462] is 15.000000 + 15.000000 = 30.000000 a[463] + b[463] == out[463] is 16.000000 + 16.000000 = 32.000000 a[464] + b[464] == out[464] is 17.000000 + 17.000000 = 34.000000 a[465] + b[465] == out[465] is 18.000000 + 18.000000 = 36.000000 a[466] + b[466] == out[466] is 19.000000 + 19.000000 = 38.000000 a[467] + b[467] == out[467] is 20.000000 + 20.000000 = 40.000000 a[468] + b[468] == out[468] is 21.000000 + 21.000000 = 42.000000 a[469] + b[469] == out[469] is 22.000000 + 22.000000 = 44.000000 a[470] + b[470] == 
out[470] is 23.000000 + 23.000000 = 46.000000 a[471] + b[471] == out[471] is 24.000000 + 24.000000 = 48.000000 a[472] + b[472] == out[472] is 25.000000 + 25.000000 = 50.000000 a[473] + b[473] == out[473] is 26.000000 + 26.000000 = 52.000000 a[474] + b[474] == out[474] is 27.000000 + 27.000000 = 54.000000 a[475] + b[475] == out[475] is 28.000000 + 28.000000 = 56.000000 a[476] + b[476] == out[476] is 29.000000 + 29.000000 = 58.000000 a[477] + b[477] == out[477] is 30.000000 + 30.000000 = 60.000000 a[478] + b[478] == out[478] is 31.000000 + 31.000000 = 62.000000 a[479] + b[479] == out[479] is 32.000000 + 32.000000 = 64.000000 a[480] + b[480] == out[480] is 1.000000 + 1.000000 = 2.000000 a[481] + b[481] == out[481] is 2.000000 + 2.000000 = 4.000000 a[482] + b[482] == out[482] is 3.000000 + 3.000000 = 6.000000 a[483] + b[483] == out[483] is 4.000000 + 4.000000 = 8.000000 a[484] + b[484] == out[484] is 5.000000 + 5.000000 = 10.000000 a[485] + b[485] == out[485] is 6.000000 + 6.000000 = 12.000000 a[486] + b[486] == out[486] is 7.000000 + 7.000000 = 14.000000 a[487] + b[487] == out[487] is 8.000000 + 8.000000 = 16.000000 a[488] + b[488] == out[488] is 9.000000 + 9.000000 = 18.000000 a[489] + b[489] == out[489] is 10.000000 + 10.000000 = 20.000000 a[490] + b[490] == out[490] is 11.000000 + 11.000000 = 22.000000 a[491] + b[491] == out[491] is 12.000000 + 12.000000 = 24.000000 a[492] + b[492] == out[492] is 13.000000 + 13.000000 = 26.000000 a[493] + b[493] == out[493] is 14.000000 + 14.000000 = 28.000000 a[494] + b[494] == out[494] is 15.000000 + 15.000000 = 30.000000 a[495] + b[495] == out[495] is 16.000000 + 16.000000 = 32.000000 a[496] + b[496] == out[496] is 17.000000 + 17.000000 = 34.000000 a[497] + b[497] == out[497] is 18.000000 + 18.000000 = 36.000000 a[498] + b[498] == out[498] is 19.000000 + 19.000000 = 38.000000 a[499] + b[499] == out[499] is 20.000000 + 20.000000 = 40.000000 a[500] + b[500] == out[500] is 21.000000 + 21.000000 = 42.000000 a[501] + b[501] == 
out[501] is 22.000000 + 22.000000 = 44.000000 a[502] + b[502] == out[502] is 23.000000 + 23.000000 = 46.000000 a[503] + b[503] == out[503] is 24.000000 + 24.000000 = 48.000000 a[504] + b[504] == out[504] is 25.000000 + 25.000000 = 50.000000 a[505] + b[505] == out[505] is 26.000000 + 26.000000 = 52.000000 a[506] + b[506] == out[506] is 27.000000 + 27.000000 = 54.000000 a[507] + b[507] == out[507] is 28.000000 + 28.000000 = 56.000000 a[508] + b[508] == out[508] is 29.000000 + 29.000000 = 58.000000 a[509] + b[509] == out[509] is 30.000000 + 30.000000 = 60.000000 a[510] + b[510] == out[510] is 31.000000 + 31.000000 = 62.000000 a[511] + b[511] == out[511] is 32.000000 + 32.000000 = 64.000000 a[512] + b[512] == out[512] is 1.000000 + 1.000000 = 2.000000 a[513] + b[513] == out[513] is 2.000000 + 2.000000 = 4.000000 a[514] + b[514] == out[514] is 3.000000 + 3.000000 = 6.000000 a[515] + b[515] == out[515] is 4.000000 + 4.000000 = 8.000000 a[516] + b[516] == out[516] is 5.000000 + 5.000000 = 10.000000 a[517] + b[517] == out[517] is 6.000000 + 6.000000 = 12.000000 a[518] + b[518] == out[518] is 7.000000 + 7.000000 = 14.000000 a[519] + b[519] == out[519] is 8.000000 + 8.000000 = 16.000000 a[520] + b[520] == out[520] is 9.000000 + 9.000000 = 18.000000 a[521] + b[521] == out[521] is 10.000000 + 10.000000 = 20.000000 a[522] + b[522] == out[522] is 11.000000 + 11.000000 = 22.000000 a[523] + b[523] == out[523] is 12.000000 + 12.000000 = 24.000000 a[524] + b[524] == out[524] is 13.000000 + 13.000000 = 26.000000 a[525] + b[525] == out[525] is 14.000000 + 14.000000 = 28.000000 a[526] + b[526] == out[526] is 15.000000 + 15.000000 = 30.000000 a[527] + b[527] == out[527] is 16.000000 + 16.000000 = 32.000000 a[528] + b[528] == out[528] is 17.000000 + 17.000000 = 34.000000 a[529] + b[529] == out[529] is 18.000000 + 18.000000 = 36.000000 a[530] + b[530] == out[530] is 19.000000 + 19.000000 = 38.000000 a[531] + b[531] == out[531] is 20.000000 + 20.000000 = 40.000000 a[532] + b[532] == 
out[532] is 21.000000 + 21.000000 = 42.000000 a[533] + b[533] == out[533] is 22.000000 + 22.000000 = 44.000000 a[534] + b[534] == out[534] is 23.000000 + 23.000000 = 46.000000 a[535] + b[535] == out[535] is 24.000000 + 24.000000 = 48.000000 a[536] + b[536] == out[536] is 25.000000 + 25.000000 = 50.000000 a[537] + b[537] == out[537] is 26.000000 + 26.000000 = 52.000000 a[538] + b[538] == out[538] is 27.000000 + 27.000000 = 54.000000 a[539] + b[539] == out[539] is 28.000000 + 28.000000 = 56.000000 a[540] + b[540] == out[540] is 29.000000 + 29.000000 = 58.000000 a[541] + b[541] == out[541] is 30.000000 + 30.000000 = 60.000000 a[542] + b[542] == out[542] is 31.000000 + 31.000000 = 62.000000 a[543] + b[543] == out[543] is 32.000000 + 32.000000 = 64.000000 a[544] + b[544] == out[544] is 1.000000 + 1.000000 = 2.000000 a[545] + b[545] == out[545] is 2.000000 + 2.000000 = 4.000000 a[546] + b[546] == out[546] is 3.000000 + 3.000000 = 6.000000 a[547] + b[547] == out[547] is 4.000000 + 4.000000 = 8.000000 a[548] + b[548] == out[548] is 5.000000 + 5.000000 = 10.000000 a[549] + b[549] == out[549] is 6.000000 + 6.000000 = 12.000000 a[550] + b[550] == out[550] is 7.000000 + 7.000000 = 14.000000 a[551] + b[551] == out[551] is 8.000000 + 8.000000 = 16.000000 a[552] + b[552] == out[552] is 9.000000 + 9.000000 = 18.000000 a[553] + b[553] == out[553] is 10.000000 + 10.000000 = 20.000000 a[554] + b[554] == out[554] is 11.000000 + 11.000000 = 22.000000 a[555] + b[555] == out[555] is 12.000000 + 12.000000 = 24.000000 a[556] + b[556] == out[556] is 13.000000 + 13.000000 = 26.000000 a[557] + b[557] == out[557] is 14.000000 + 14.000000 = 28.000000 a[558] + b[558] == out[558] is 15.000000 + 15.000000 = 30.000000 a[559] + b[559] == out[559] is 16.000000 + 16.000000 = 32.000000 a[560] + b[560] == out[560] is 17.000000 + 17.000000 = 34.000000 a[561] + b[561] == out[561] is 18.000000 + 18.000000 = 36.000000 a[562] + b[562] == out[562] is 19.000000 + 19.000000 = 38.000000 a[563] + b[563] == 
out[563] is 20.000000 + 20.000000 = 40.000000 a[564] + b[564] == out[564] is 21.000000 + 21.000000 = 42.000000 a[565] + b[565] == out[565] is 22.000000 + 22.000000 = 44.000000 a[566] + b[566] == out[566] is 23.000000 + 23.000000 = 46.000000 a[567] + b[567] == out[567] is 24.000000 + 24.000000 = 48.000000 a[568] + b[568] == out[568] is 25.000000 + 25.000000 = 50.000000 a[569] + b[569] == out[569] is 26.000000 + 26.000000 = 52.000000 a[570] + b[570] == out[570] is 27.000000 + 27.000000 = 54.000000 a[571] + b[571] == out[571] is 28.000000 + 28.000000 = 56.000000 a[572] + b[572] == out[572] is 29.000000 + 29.000000 = 58.000000 a[573] + b[573] == out[573] is 30.000000 + 30.000000 = 60.000000 a[574] + b[574] == out[574] is 31.000000 + 31.000000 = 62.000000 a[575] + b[575] == out[575] is 32.000000 + 32.000000 = 64.000000 a[576] + b[576] == out[576] is 1.000000 + 1.000000 = 2.000000 a[577] + b[577] == out[577] is 2.000000 + 2.000000 = 4.000000 a[578] + b[578] == out[578] is 3.000000 + 3.000000 = 6.000000 a[579] + b[579] == out[579] is 4.000000 + 4.000000 = 8.000000 a[580] + b[580] == out[580] is 5.000000 + 5.000000 = 10.000000 a[581] + b[581] == out[581] is 6.000000 + 6.000000 = 12.000000 a[582] + b[582] == out[582] is 7.000000 + 7.000000 = 14.000000 a[583] + b[583] == out[583] is 8.000000 + 8.000000 = 16.000000 a[584] + b[584] == out[584] is 9.000000 + 9.000000 = 18.000000 a[585] + b[585] == out[585] is 10.000000 + 10.000000 = 20.000000 a[586] + b[586] == out[586] is 11.000000 + 11.000000 = 22.000000 a[587] + b[587] == out[587] is 12.000000 + 12.000000 = 24.000000 a[588] + b[588] == out[588] is 13.000000 + 13.000000 = 26.000000 a[589] + b[589] == out[589] is 14.000000 + 14.000000 = 28.000000 a[590] + b[590] == out[590] is 15.000000 + 15.000000 = 30.000000 a[591] + b[591] == out[591] is 16.000000 + 16.000000 = 32.000000 a[592] + b[592] == out[592] is 17.000000 + 17.000000 = 34.000000 a[593] + b[593] == out[593] is 18.000000 + 18.000000 = 36.000000 a[594] + b[594] == 
out[594] is 19.000000 + 19.000000 = 38.000000 a[595] + b[595] == out[595] is 20.000000 + 20.000000 = 40.000000 a[596] + b[596] == out[596] is 21.000000 + 21.000000 = 42.000000 a[597] + b[597] == out[597] is 22.000000 + 22.000000 = 44.000000 a[598] + b[598] == out[598] is 23.000000 + 23.000000 = 46.000000 a[599] + b[599] == out[599] is 24.000000 + 24.000000 = 48.000000 a[600] + b[600] == out[600] is 25.000000 + 25.000000 = 50.000000 a[601] + b[601] == out[601] is 26.000000 + 26.000000 = 52.000000 a[602] + b[602] == out[602] is 27.000000 + 27.000000 = 54.000000 a[603] + b[603] == out[603] is 28.000000 + 28.000000 = 56.000000 a[604] + b[604] == out[604] is 29.000000 + 29.000000 = 58.000000 a[605] + b[605] == out[605] is 30.000000 + 30.000000 = 60.000000 a[606] + b[606] == out[606] is 31.000000 + 31.000000 = 62.000000 a[607] + b[607] == out[607] is 32.000000 + 32.000000 = 64.000000 a[608] + b[608] == out[608] is 1.000000 + 1.000000 = 2.000000 a[609] + b[609] == out[609] is 2.000000 + 2.000000 = 4.000000 a[610] + b[610] == out[610] is 3.000000 + 3.000000 = 6.000000 a[611] + b[611] == out[611] is 4.000000 + 4.000000 = 8.000000 a[612] + b[612] == out[612] is 5.000000 + 5.000000 = 10.000000 a[613] + b[613] == out[613] is 6.000000 + 6.000000 = 12.000000 a[614] + b[614] == out[614] is 7.000000 + 7.000000 = 14.000000 a[615] + b[615] == out[615] is 8.000000 + 8.000000 = 16.000000 a[616] + b[616] == out[616] is 9.000000 + 9.000000 = 18.000000 a[617] + b[617] == out[617] is 10.000000 + 10.000000 = 20.000000 a[618] + b[618] == out[618] is 11.000000 + 11.000000 = 22.000000 a[619] + b[619] == out[619] is 12.000000 + 12.000000 = 24.000000 a[620] + b[620] == out[620] is 13.000000 + 13.000000 = 26.000000 a[621] + b[621] == out[621] is 14.000000 + 14.000000 = 28.000000 a[622] + b[622] == out[622] is 15.000000 + 15.000000 = 30.000000 a[623] + b[623] == out[623] is 16.000000 + 16.000000 = 32.000000 a[624] + b[624] == out[624] is 17.000000 + 17.000000 = 34.000000 a[625] + b[625] == 
out[625] is 18.000000 + 18.000000 = 36.000000 a[626] + b[626] == out[626] is 19.000000 + 19.000000 = 38.000000 a[627] + b[627] == out[627] is 20.000000 + 20.000000 = 40.000000 a[628] + b[628] == out[628] is 21.000000 + 21.000000 = 42.000000 a[629] + b[629] == out[629] is 22.000000 + 22.000000 = 44.000000 a[630] + b[630] == out[630] is 23.000000 + 23.000000 = 46.000000 a[631] + b[631] == out[631] is 24.000000 + 24.000000 = 48.000000 a[632] + b[632] == out[632] is 25.000000 + 25.000000 = 50.000000 a[633] + b[633] == out[633] is 26.000000 + 26.000000 = 52.000000 a[634] + b[634] == out[634] is 27.000000 + 27.000000 = 54.000000 a[635] + b[635] == out[635] is 28.000000 + 28.000000 = 56.000000 a[636] + b[636] == out[636] is 29.000000 + 29.000000 = 58.000000 a[637] + b[637] == out[637] is 30.000000 + 30.000000 = 60.000000 a[638] + b[638] == out[638] is 31.000000 + 31.000000 = 62.000000 a[639] + b[639] == out[639] is 32.000000 + 32.000000 = 64.000000 a[640] + b[640] == out[640] is 1.000000 + 1.000000 = 2.000000 a[641] + b[641] == out[641] is 2.000000 + 2.000000 = 4.000000 a[642] + b[642] == out[642] is 3.000000 + 3.000000 = 6.000000 a[643] + b[643] == out[643] is 4.000000 + 4.000000 = 8.000000 a[644] + b[644] == out[644] is 5.000000 + 5.000000 = 10.000000 a[645] + b[645] == out[645] is 6.000000 + 6.000000 = 12.000000 a[646] + b[646] == out[646] is 7.000000 + 7.000000 = 14.000000 a[647] + b[647] == out[647] is 8.000000 + 8.000000 = 16.000000 a[648] + b[648] == out[648] is 9.000000 + 9.000000 = 18.000000 a[649] + b[649] == out[649] is 10.000000 + 10.000000 = 20.000000 a[650] + b[650] == out[650] is 11.000000 + 11.000000 = 22.000000 a[651] + b[651] == out[651] is 12.000000 + 12.000000 = 24.000000 a[652] + b[652] == out[652] is 13.000000 + 13.000000 = 26.000000 a[653] + b[653] == out[653] is 14.000000 + 14.000000 = 28.000000 a[654] + b[654] == out[654] is 15.000000 + 15.000000 = 30.000000 a[655] + b[655] == out[655] is 16.000000 + 16.000000 = 32.000000 a[656] + b[656] == 
out[656] is 17.000000 + 17.000000 = 34.000000 a[657] + b[657] == out[657] is 18.000000 + 18.000000 = 36.000000 a[658] + b[658] == out[658] is 19.000000 + 19.000000 = 38.000000 a[659] + b[659] == out[659] is 20.000000 + 20.000000 = 40.000000 a[660] + b[660] == out[660] is 21.000000 + 21.000000 = 42.000000 a[661] + b[661] == out[661] is 22.000000 + 22.000000 = 44.000000 a[662] + b[662] == out[662] is 23.000000 + 23.000000 = 46.000000 a[663] + b[663] == out[663] is 24.000000 + 24.000000 = 48.000000 a[664] + b[664] == out[664] is 25.000000 + 25.000000 = 50.000000 a[665] + b[665] == out[665] is 26.000000 + 26.000000 = 52.000000 a[666] + b[666] == out[666] is 27.000000 + 27.000000 = 54.000000 a[667] + b[667] == out[667] is 28.000000 + 28.000000 = 56.000000 a[668] + b[668] == out[668] is 29.000000 + 29.000000 = 58.000000 a[669] + b[669] == out[669] is 30.000000 + 30.000000 = 60.000000 a[670] + b[670] == out[670] is 31.000000 + 31.000000 = 62.000000 a[671] + b[671] == out[671] is 32.000000 + 32.000000 = 64.000000 a[672] + b[672] == out[672] is 1.000000 + 1.000000 = 2.000000 a[673] + b[673] == out[673] is 2.000000 + 2.000000 = 4.000000 a[674] + b[674] == out[674] is 3.000000 + 3.000000 = 6.000000 a[675] + b[675] == out[675] is 4.000000 + 4.000000 = 8.000000 a[676] + b[676] == out[676] is 5.000000 + 5.000000 = 10.000000 a[677] + b[677] == out[677] is 6.000000 + 6.000000 = 12.000000 a[678] + b[678] == out[678] is 7.000000 + 7.000000 = 14.000000 a[679] + b[679] == out[679] is 8.000000 + 8.000000 = 16.000000 a[680] + b[680] == out[680] is 9.000000 + 9.000000 = 18.000000 a[681] + b[681] == out[681] is 10.000000 + 10.000000 = 20.000000 a[682] + b[682] == out[682] is 11.000000 + 11.000000 = 22.000000 a[683] + b[683] == out[683] is 12.000000 + 12.000000 = 24.000000 a[684] + b[684] == out[684] is 13.000000 + 13.000000 = 26.000000 a[685] + b[685] == out[685] is 14.000000 + 14.000000 = 28.000000 a[686] + b[686] == out[686] is 15.000000 + 15.000000 = 30.000000 a[687] + b[687] == 
out[687] is 16.000000 + 16.000000 = 32.000000 a[688] + b[688] == out[688] is 17.000000 + 17.000000 = 34.000000 a[689] + b[689] == out[689] is 18.000000 + 18.000000 = 36.000000 a[690] + b[690] == out[690] is 19.000000 + 19.000000 = 38.000000 a[691] + b[691] == out[691] is 20.000000 + 20.000000 = 40.000000 a[692] + b[692] == out[692] is 21.000000 + 21.000000 = 42.000000 a[693] + b[693] == out[693] is 22.000000 + 22.000000 = 44.000000 a[694] + b[694] == out[694] is 23.000000 + 23.000000 = 46.000000 a[695] + b[695] == out[695] is 24.000000 + 24.000000 = 48.000000 a[696] + b[696] == out[696] is 25.000000 + 25.000000 = 50.000000 a[697] + b[697] == out[697] is 26.000000 + 26.000000 = 52.000000 a[698] + b[698] == out[698] is 27.000000 + 27.000000 = 54.000000 a[699] + b[699] == out[699] is 28.000000 + 28.000000 = 56.000000 a[700] + b[700] == out[700] is 29.000000 + 29.000000 = 58.000000 a[701] + b[701] == out[701] is 30.000000 + 30.000000 = 60.000000 a[702] + b[702] == out[702] is 31.000000 + 31.000000 = 62.000000 a[703] + b[703] == out[703] is 32.000000 + 32.000000 = 64.000000 a[704] + b[704] == out[704] is 1.000000 + 1.000000 = 2.000000 a[705] + b[705] == out[705] is 2.000000 + 2.000000 = 4.000000 a[706] + b[706] == out[706] is 3.000000 + 3.000000 = 6.000000 a[707] + b[707] == out[707] is 4.000000 + 4.000000 = 8.000000 a[708] + b[708] == out[708] is 5.000000 + 5.000000 = 10.000000 a[709] + b[709] == out[709] is 6.000000 + 6.000000 = 12.000000 a[710] + b[710] == out[710] is 7.000000 + 7.000000 = 14.000000 a[711] + b[711] == out[711] is 8.000000 + 8.000000 = 16.000000 a[712] + b[712] == out[712] is 9.000000 + 9.000000 = 18.000000 a[713] + b[713] == out[713] is 10.000000 + 10.000000 = 20.000000 a[714] + b[714] == out[714] is 11.000000 + 11.000000 = 22.000000 a[715] + b[715] == out[715] is 12.000000 + 12.000000 = 24.000000 a[716] + b[716] == out[716] is 13.000000 + 13.000000 = 26.000000 a[717] + b[717] == out[717] is 14.000000 + 14.000000 = 28.000000 a[718] + b[718] == 
out[718] is 15.000000 + 15.000000 = 30.000000 a[719] + b[719] == out[719] is 16.000000 + 16.000000 = 32.000000 a[720] + b[720] == out[720] is 17.000000 + 17.000000 = 34.000000 a[721] + b[721] == out[721] is 18.000000 + 18.000000 = 36.000000 a[722] + b[722] == out[722] is 19.000000 + 19.000000 = 38.000000 a[723] + b[723] == out[723] is 20.000000 + 20.000000 = 40.000000 a[724] + b[724] == out[724] is 21.000000 + 21.000000 = 42.000000 a[725] + b[725] == out[725] is 22.000000 + 22.000000 = 44.000000 a[726] + b[726] == out[726] is 23.000000 + 23.000000 = 46.000000 a[727] + b[727] == out[727] is 24.000000 + 24.000000 = 48.000000 a[728] + b[728] == out[728] is 25.000000 + 25.000000 = 50.000000 a[729] + b[729] == out[729] is 26.000000 + 26.000000 = 52.000000 a[730] + b[730] == out[730] is 27.000000 + 27.000000 = 54.000000 a[731] + b[731] == out[731] is 28.000000 + 28.000000 = 56.000000 a[732] + b[732] == out[732] is 29.000000 + 29.000000 = 58.000000 a[733] + b[733] == out[733] is 30.000000 + 30.000000 = 60.000000 a[734] + b[734] == out[734] is 31.000000 + 31.000000 = 62.000000 a[735] + b[735] == out[735] is 32.000000 + 32.000000 = 64.000000 a[736] + b[736] == out[736] is 1.000000 + 1.000000 = 2.000000 a[737] + b[737] == out[737] is 2.000000 + 2.000000 = 4.000000 a[738] + b[738] == out[738] is 3.000000 + 3.000000 = 6.000000 a[739] + b[739] == out[739] is 4.000000 + 4.000000 = 8.000000 a[740] + b[740] == out[740] is 5.000000 + 5.000000 = 10.000000 a[741] + b[741] == out[741] is 6.000000 + 6.000000 = 12.000000 a[742] + b[742] == out[742] is 7.000000 + 7.000000 = 14.000000 a[743] + b[743] == out[743] is 8.000000 + 8.000000 = 16.000000 a[744] + b[744] == out[744] is 9.000000 + 9.000000 = 18.000000 a[745] + b[745] == out[745] is 10.000000 + 10.000000 = 20.000000 a[746] + b[746] == out[746] is 11.000000 + 11.000000 = 22.000000 a[747] + b[747] == out[747] is 12.000000 + 12.000000 = 24.000000 a[748] + b[748] == out[748] is 13.000000 + 13.000000 = 26.000000 a[749] + b[749] == 
out[749] is 14.000000 + 14.000000 = 28.000000 a[750] + b[750] == out[750] is 15.000000 + 15.000000 = 30.000000 a[751] + b[751] == out[751] is 16.000000 + 16.000000 = 32.000000 a[752] + b[752] == out[752] is 17.000000 + 17.000000 = 34.000000 a[753] + b[753] == out[753] is 18.000000 + 18.000000 = 36.000000 a[754] + b[754] == out[754] is 19.000000 + 19.000000 = 38.000000 a[755] + b[755] == out[755] is 20.000000 + 20.000000 = 40.000000 a[756] + b[756] == out[756] is 21.000000 + 21.000000 = 42.000000 a[757] + b[757] == out[757] is 22.000000 + 22.000000 = 44.000000 a[758] + b[758] == out[758] is 23.000000 + 23.000000 = 46.000000 a[759] + b[759] == out[759] is 24.000000 + 24.000000 = 48.000000 a[760] + b[760] == out[760] is 25.000000 + 25.000000 = 50.000000 a[761] + b[761] == out[761] is 26.000000 + 26.000000 = 52.000000 a[762] + b[762] == out[762] is 27.000000 + 27.000000 = 54.000000 a[763] + b[763] == out[763] is 28.000000 + 28.000000 = 56.000000 a[764] + b[764] == out[764] is 29.000000 + 29.000000 = 58.000000 a[765] + b[765] == out[765] is 30.000000 + 30.000000 = 60.000000 a[766] + b[766] == out[766] is 31.000000 + 31.000000 = 62.000000 a[767] + b[767] == out[767] is 32.000000 + 32.000000 = 64.000000 a[768] + b[768] == out[768] is 1.000000 + 1.000000 = 2.000000 a[769] + b[769] == out[769] is 2.000000 + 2.000000 = 4.000000 a[770] + b[770] == out[770] is 3.000000 + 3.000000 = 6.000000 a[771] + b[771] == out[771] is 4.000000 + 4.000000 = 8.000000 a[772] + b[772] == out[772] is 5.000000 + 5.000000 = 10.000000 a[773] + b[773] == out[773] is 6.000000 + 6.000000 = 12.000000 a[774] + b[774] == out[774] is 7.000000 + 7.000000 = 14.000000 a[775] + b[775] == out[775] is 8.000000 + 8.000000 = 16.000000 a[776] + b[776] == out[776] is 9.000000 + 9.000000 = 18.000000 a[777] + b[777] == out[777] is 10.000000 + 10.000000 = 20.000000 a[778] + b[778] == out[778] is 11.000000 + 11.000000 = 22.000000 a[779] + b[779] == out[779] is 12.000000 + 12.000000 = 24.000000 a[780] + b[780] == 
out[780] is 13.000000 + 13.000000 = 26.000000 a[781] + b[781] == out[781] is 14.000000 + 14.000000 = 28.000000 a[782] + b[782] == out[782] is 15.000000 + 15.000000 = 30.000000 a[783] + b[783] == out[783] is 16.000000 + 16.000000 = 32.000000 a[784] + b[784] == out[784] is 17.000000 + 17.000000 = 34.000000 a[785] + b[785] == out[785] is 18.000000 + 18.000000 = 36.000000 a[786] + b[786] == out[786] is 19.000000 + 19.000000 = 38.000000 a[787] + b[787] == out[787] is 20.000000 + 20.000000 = 40.000000 a[788] + b[788] == out[788] is 21.000000 + 21.000000 = 42.000000 a[789] + b[789] == out[789] is 22.000000 + 22.000000 = 44.000000 a[790] + b[790] == out[790] is 23.000000 + 23.000000 = 46.000000 a[791] + b[791] == out[791] is 24.000000 + 24.000000 = 48.000000 a[792] + b[792] == out[792] is 25.000000 + 25.000000 = 50.000000 a[793] + b[793] == out[793] is 26.000000 + 26.000000 = 52.000000 a[794] + b[794] == out[794] is 27.000000 + 27.000000 = 54.000000 a[795] + b[795] == out[795] is 28.000000 + 28.000000 = 56.000000 a[796] + b[796] == out[796] is 29.000000 + 29.000000 = 58.000000 a[797] + b[797] == out[797] is 30.000000 + 30.000000 = 60.000000 a[798] + b[798] == out[798] is 31.000000 + 31.000000 = 62.000000 a[799] + b[799] == out[799] is 32.000000 + 32.000000 = 64.000000 a[800] + b[800] == out[800] is 1.000000 + 1.000000 = 2.000000 a[801] + b[801] == out[801] is 2.000000 + 2.000000 = 4.000000 a[802] + b[802] == out[802] is 3.000000 + 3.000000 = 6.000000 a[803] + b[803] == out[803] is 4.000000 + 4.000000 = 8.000000 a[804] + b[804] == out[804] is 5.000000 + 5.000000 = 10.000000 a[805] + b[805] == out[805] is 6.000000 + 6.000000 = 12.000000 a[806] + b[806] == out[806] is 7.000000 + 7.000000 = 14.000000 a[807] + b[807] == out[807] is 8.000000 + 8.000000 = 16.000000 a[808] + b[808] == out[808] is 9.000000 + 9.000000 = 18.000000 a[809] + b[809] == out[809] is 10.000000 + 10.000000 = 20.000000 a[810] + b[810] == out[810] is 11.000000 + 11.000000 = 22.000000 a[811] + b[811] == 
out[811] is 12.000000 + 12.000000 = 24.000000 a[812] + b[812] == out[812] is 13.000000 + 13.000000 = 26.000000 a[813] + b[813] == out[813] is 14.000000 + 14.000000 = 28.000000 a[814] + b[814] == out[814] is 15.000000 + 15.000000 = 30.000000 a[815] + b[815] == out[815] is 16.000000 + 16.000000 = 32.000000 a[816] + b[816] == out[816] is 17.000000 + 17.000000 = 34.000000 a[817] + b[817] == out[817] is 18.000000 + 18.000000 = 36.000000 a[818] + b[818] == out[818] is 19.000000 + 19.000000 = 38.000000 a[819] + b[819] == out[819] is 20.000000 + 20.000000 = 40.000000 a[820] + b[820] == out[820] is 21.000000 + 21.000000 = 42.000000 a[821] + b[821] == out[821] is 22.000000 + 22.000000 = 44.000000 a[822] + b[822] == out[822] is 23.000000 + 23.000000 = 46.000000 a[823] + b[823] == out[823] is 24.000000 + 24.000000 = 48.000000 a[824] + b[824] == out[824] is 25.000000 + 25.000000 = 50.000000 a[825] + b[825] == out[825] is 26.000000 + 26.000000 = 52.000000 a[826] + b[826] == out[826] is 27.000000 + 27.000000 = 54.000000 a[827] + b[827] == out[827] is 28.000000 + 28.000000 = 56.000000 a[828] + b[828] == out[828] is 29.000000 + 29.000000 = 58.000000 a[829] + b[829] == out[829] is 30.000000 + 30.000000 = 60.000000 a[830] + b[830] == out[830] is 31.000000 + 31.000000 = 62.000000 a[831] + b[831] == out[831] is 32.000000 + 32.000000 = 64.000000 a[832] + b[832] == out[832] is 1.000000 + 1.000000 = 2.000000 a[833] + b[833] == out[833] is 2.000000 + 2.000000 = 4.000000 a[834] + b[834] == out[834] is 3.000000 + 3.000000 = 6.000000 a[835] + b[835] == out[835] is 4.000000 + 4.000000 = 8.000000 a[836] + b[836] == out[836] is 5.000000 + 5.000000 = 10.000000 a[837] + b[837] == out[837] is 6.000000 + 6.000000 = 12.000000 a[838] + b[838] == out[838] is 7.000000 + 7.000000 = 14.000000 a[839] + b[839] == out[839] is 8.000000 + 8.000000 = 16.000000 a[840] + b[840] == out[840] is 9.000000 + 9.000000 = 18.000000 a[841] + b[841] == out[841] is 10.000000 + 10.000000 = 20.000000 a[842] + b[842] == 
out[842] is 11.000000 + 11.000000 = 22.000000 a[843] + b[843] == out[843] is 12.000000 + 12.000000 = 24.000000 a[844] + b[844] == out[844] is 13.000000 + 13.000000 = 26.000000 a[845] + b[845] == out[845] is 14.000000 + 14.000000 = 28.000000 a[846] + b[846] == out[846] is 15.000000 + 15.000000 = 30.000000 a[847] + b[847] == out[847] is 16.000000 + 16.000000 = 32.000000 a[848] + b[848] == out[848] is 17.000000 + 17.000000 = 34.000000 a[849] + b[849] == out[849] is 18.000000 + 18.000000 = 36.000000 a[850] + b[850] == out[850] is 19.000000 + 19.000000 = 38.000000 a[851] + b[851] == out[851] is 20.000000 + 20.000000 = 40.000000 a[852] + b[852] == out[852] is 21.000000 + 21.000000 = 42.000000 a[853] + b[853] == out[853] is 22.000000 + 22.000000 = 44.000000 a[854] + b[854] == out[854] is 23.000000 + 23.000000 = 46.000000 a[855] + b[855] == out[855] is 24.000000 + 24.000000 = 48.000000 a[856] + b[856] == out[856] is 25.000000 + 25.000000 = 50.000000 a[857] + b[857] == out[857] is 26.000000 + 26.000000 = 52.000000 a[858] + b[858] == out[858] is 27.000000 + 27.000000 = 54.000000 a[859] + b[859] == out[859] is 28.000000 + 28.000000 = 56.000000 a[860] + b[860] == out[860] is 29.000000 + 29.000000 = 58.000000 a[861] + b[861] == out[861] is 30.000000 + 30.000000 = 60.000000 a[862] + b[862] == out[862] is 31.000000 + 31.000000 = 62.000000 a[863] + b[863] == out[863] is 32.000000 + 32.000000 = 64.000000 a[864] + b[864] == out[864] is 1.000000 + 1.000000 = 2.000000 a[865] + b[865] == out[865] is 2.000000 + 2.000000 = 4.000000 a[866] + b[866] == out[866] is 3.000000 + 3.000000 = 6.000000 a[867] + b[867] == out[867] is 4.000000 + 4.000000 = 8.000000 a[868] + b[868] == out[868] is 5.000000 + 5.000000 = 10.000000 a[869] + b[869] == out[869] is 6.000000 + 6.000000 = 12.000000 a[870] + b[870] == out[870] is 7.000000 + 7.000000 = 14.000000 a[871] + b[871] == out[871] is 8.000000 + 8.000000 = 16.000000 a[872] + b[872] == out[872] is 9.000000 + 9.000000 = 18.000000 a[873] + b[873] == 
out[873] is 10.000000 + 10.000000 = 20.000000 a[874] + b[874] == out[874] is 11.000000 + 11.000000 = 22.000000 a[875] + b[875] == out[875] is 12.000000 + 12.000000 = 24.000000 a[876] + b[876] == out[876] is 13.000000 + 13.000000 = 26.000000 a[877] + b[877] == out[877] is 14.000000 + 14.000000 = 28.000000 a[878] + b[878] == out[878] is 15.000000 + 15.000000 = 30.000000 a[879] + b[879] == out[879] is 16.000000 + 16.000000 = 32.000000 a[880] + b[880] == out[880] is 17.000000 + 17.000000 = 34.000000 a[881] + b[881] == out[881] is 18.000000 + 18.000000 = 36.000000 a[882] + b[882] == out[882] is 19.000000 + 19.000000 = 38.000000 a[883] + b[883] == out[883] is 20.000000 + 20.000000 = 40.000000 a[884] + b[884] == out[884] is 21.000000 + 21.000000 = 42.000000 a[885] + b[885] == out[885] is 22.000000 + 22.000000 = 44.000000 a[886] + b[886] == out[886] is 23.000000 + 23.000000 = 46.000000 a[887] + b[887] == out[887] is 24.000000 + 24.000000 = 48.000000 a[888] + b[888] == out[888] is 25.000000 + 25.000000 = 50.000000 a[889] + b[889] == out[889] is 26.000000 + 26.000000 = 52.000000 a[890] + b[890] == out[890] is 27.000000 + 27.000000 = 54.000000 a[891] + b[891] == out[891] is 28.000000 + 28.000000 = 56.000000 a[892] + b[892] == out[892] is 29.000000 + 29.000000 = 58.000000 a[893] + b[893] == out[893] is 30.000000 + 30.000000 = 60.000000 a[894] + b[894] == out[894] is 31.000000 + 31.000000 = 62.000000 a[895] + b[895] == out[895] is 32.000000 + 32.000000 = 64.000000 a[896] + b[896] == out[896] is 1.000000 + 1.000000 = 2.000000 a[897] + b[897] == out[897] is 2.000000 + 2.000000 = 4.000000 a[898] + b[898] == out[898] is 3.000000 + 3.000000 = 6.000000 a[899] + b[899] == out[899] is 4.000000 + 4.000000 = 8.000000 a[900] + b[900] == out[900] is 5.000000 + 5.000000 = 10.000000 a[901] + b[901] == out[901] is 6.000000 + 6.000000 = 12.000000 a[902] + b[902] == out[902] is 7.000000 + 7.000000 = 14.000000 a[903] + b[903] == out[903] is 8.000000 + 8.000000 = 16.000000 a[904] + b[904] == 
out[904] is 9.000000 + 9.000000 = 18.000000 a[905] + b[905] == out[905] is 10.000000 + 10.000000 = 20.000000 a[906] + b[906] == out[906] is 11.000000 + 11.000000 = 22.000000 a[907] + b[907] == out[907] is 12.000000 + 12.000000 = 24.000000 a[908] + b[908] == out[908] is 13.000000 + 13.000000 = 26.000000 a[909] + b[909] == out[909] is 14.000000 + 14.000000 = 28.000000 a[910] + b[910] == out[910] is 15.000000 + 15.000000 = 30.000000 a[911] + b[911] == out[911] is 16.000000 + 16.000000 = 32.000000 a[912] + b[912] == out[912] is 17.000000 + 17.000000 = 34.000000 a[913] + b[913] == out[913] is 18.000000 + 18.000000 = 36.000000 a[914] + b[914] == out[914] is 19.000000 + 19.000000 = 38.000000 a[915] + b[915] == out[915] is 20.000000 + 20.000000 = 40.000000 a[916] + b[916] == out[916] is 21.000000 + 21.000000 = 42.000000 a[917] + b[917] == out[917] is 22.000000 + 22.000000 = 44.000000 a[918] + b[918] == out[918] is 23.000000 + 23.000000 = 46.000000 a[919] + b[919] == out[919] is 24.000000 + 24.000000 = 48.000000 a[920] + b[920] == out[920] is 25.000000 + 25.000000 = 50.000000 a[921] + b[921] == out[921] is 26.000000 + 26.000000 = 52.000000 a[922] + b[922] == out[922] is 27.000000 + 27.000000 = 54.000000 a[923] + b[923] == out[923] is 28.000000 + 28.000000 = 56.000000 a[924] + b[924] == out[924] is 29.000000 + 29.000000 = 58.000000 a[925] + b[925] == out[925] is 30.000000 + 30.000000 = 60.000000 a[926] + b[926] == out[926] is 31.000000 + 31.000000 = 62.000000 a[927] + b[927] == out[927] is 32.000000 + 32.000000 = 64.000000 a[928] + b[928] == out[928] is 1.000000 + 1.000000 = 2.000000 a[929] + b[929] == out[929] is 2.000000 + 2.000000 = 4.000000 a[930] + b[930] == out[930] is 3.000000 + 3.000000 = 6.000000 a[931] + b[931] == out[931] is 4.000000 + 4.000000 = 8.000000 a[932] + b[932] == out[932] is 5.000000 + 5.000000 = 10.000000 a[933] + b[933] == out[933] is 6.000000 + 6.000000 = 12.000000 a[934] + b[934] == out[934] is 7.000000 + 7.000000 = 14.000000 a[935] + b[935] == 
out[935] is 8.000000 + 8.000000 = 16.000000 a[936] + b[936] == out[936] is 9.000000 + 9.000000 = 18.000000 a[937] + b[937] == out[937] is 10.000000 + 10.000000 = 20.000000 a[938] + b[938] == out[938] is 11.000000 + 11.000000 = 22.000000 a[939] + b[939] == out[939] is 12.000000 + 12.000000 = 24.000000 a[940] + b[940] == out[940] is 13.000000 + 13.000000 = 26.000000 a[941] + b[941] == out[941] is 14.000000 + 14.000000 = 28.000000 a[942] + b[942] == out[942] is 15.000000 + 15.000000 = 30.000000 a[943] + b[943] == out[943] is 16.000000 + 16.000000 = 32.000000 a[944] + b[944] == out[944] is 17.000000 + 17.000000 = 34.000000 a[945] + b[945] == out[945] is 18.000000 + 18.000000 = 36.000000 a[946] + b[946] == out[946] is 19.000000 + 19.000000 = 38.000000 a[947] + b[947] == out[947] is 20.000000 + 20.000000 = 40.000000 a[948] + b[948] == out[948] is 21.000000 + 21.000000 = 42.000000 a[949] + b[949] == out[949] is 22.000000 + 22.000000 = 44.000000 a[950] + b[950] == out[950] is 23.000000 + 23.000000 = 46.000000 a[951] + b[951] == out[951] is 24.000000 + 24.000000 = 48.000000 a[952] + b[952] == out[952] is 25.000000 + 25.000000 = 50.000000 a[953] + b[953] == out[953] is 26.000000 + 26.000000 = 52.000000 a[954] + b[954] == out[954] is 27.000000 + 27.000000 = 54.000000 a[955] + b[955] == out[955] is 28.000000 + 28.000000 = 56.000000 a[956] + b[956] == out[956] is 29.000000 + 29.000000 = 58.000000 a[957] + b[957] == out[957] is 30.000000 + 30.000000 = 60.000000 a[958] + b[958] == out[958] is 31.000000 + 31.000000 = 62.000000 a[959] + b[959] == out[959] is 32.000000 + 32.000000 = 64.000000 a[960] + b[960] == out[960] is 1.000000 + 1.000000 = 2.000000 a[961] + b[961] == out[961] is 2.000000 + 2.000000 = 4.000000 a[962] + b[962] == out[962] is 3.000000 + 3.000000 = 6.000000 a[963] + b[963] == out[963] is 4.000000 + 4.000000 = 8.000000 a[964] + b[964] == out[964] is 5.000000 + 5.000000 = 10.000000 a[965] + b[965] == out[965] is 6.000000 + 6.000000 = 12.000000 a[966] + b[966] == 
out[966] is 7.000000 + 7.000000 = 14.000000 a[967] + b[967] == out[967] is 8.000000 + 8.000000 = 16.000000 a[968] + b[968] == out[968] is 9.000000 + 9.000000 = 18.000000 a[969] + b[969] == out[969] is 10.000000 + 10.000000 = 20.000000 a[970] + b[970] == out[970] is 11.000000 + 11.000000 = 22.000000 a[971] + b[971] == out[971] is 12.000000 + 12.000000 = 24.000000 a[972] + b[972] == out[972] is 13.000000 + 13.000000 = 26.000000 a[973] + b[973] == out[973] is 14.000000 + 14.000000 = 28.000000 a[974] + b[974] == out[974] is 15.000000 + 15.000000 = 30.000000 a[975] + b[975] == out[975] is 16.000000 + 16.000000 = 32.000000 a[976] + b[976] == out[976] is 17.000000 + 17.000000 = 34.000000 a[977] + b[977] == out[977] is 18.000000 + 18.000000 = 36.000000 a[978] + b[978] == out[978] is 19.000000 + 19.000000 = 38.000000 a[979] + b[979] == out[979] is 20.000000 + 20.000000 = 40.000000 a[980] + b[980] == out[980] is 21.000000 + 21.000000 = 42.000000 a[981] + b[981] == out[981] is 22.000000 + 22.000000 = 44.000000 a[982] + b[982] == out[982] is 23.000000 + 23.000000 = 46.000000 a[983] + b[983] == out[983] is 24.000000 + 24.000000 = 48.000000 a[984] + b[984] == out[984] is 25.000000 + 25.000000 = 50.000000 a[985] + b[985] == out[985] is 26.000000 + 26.000000 = 52.000000 a[986] + b[986] == out[986] is 27.000000 + 27.000000 = 54.000000 a[987] + b[987] == out[987] is 28.000000 + 28.000000 = 56.000000 a[988] + b[988] == out[988] is 29.000000 + 29.000000 = 58.000000 a[989] + b[989] == out[989] is 30.000000 + 30.000000 = 60.000000 a[990] + b[990] == out[990] is 31.000000 + 31.000000 = 62.000000 a[991] + b[991] == out[991] is 32.000000 + 32.000000 = 64.000000 a[992] + b[992] == out[992] is 1.000000 + 1.000000 = 2.000000 a[993] + b[993] == out[993] is 2.000000 + 2.000000 = 4.000000 a[994] + b[994] == out[994] is 3.000000 + 3.000000 = 6.000000 a[995] + b[995] == out[995] is 4.000000 + 4.000000 = 8.000000 a[996] + b[996] == out[996] is 5.000000 + 5.000000 = 10.000000 a[997] + b[997] == 
out[997] is 6.000000 + 6.000000 = 12.000000 a[998] + b[998] == out[998] is 7.000000 + 7.000000 = 14.000000 a[999] + b[999] == out[999] is 8.000000 + 8.000000 = 16.000000 a[1000] + b[1000] == out[1000] is 9.000000 + 9.000000 = 18.000000 a[1001] + b[1001] == out[1001] is 10.000000 + 10.000000 = 20.000000 a[1002] + b[1002] == out[1002] is 11.000000 + 11.000000 = 22.000000 a[1003] + b[1003] == out[1003] is 12.000000 + 12.000000 = 24.000000 a[1004] + b[1004] == out[1004] is 13.000000 + 13.000000 = 26.000000 a[1005] + b[1005] == out[1005] is 14.000000 + 14.000000 = 28.000000 a[1006] + b[1006] == out[1006] is 15.000000 + 15.000000 = 30.000000 a[1007] + b[1007] == out[1007] is 16.000000 + 16.000000 = 32.000000 a[1008] + b[1008] == out[1008] is 17.000000 + 17.000000 = 34.000000 a[1009] + b[1009] == out[1009] is 18.000000 + 18.000000 = 36.000000 a[1010] + b[1010] == out[1010] is 19.000000 + 19.000000 = 38.000000 a[1011] + b[1011] == out[1011] is 20.000000 + 20.000000 = 40.000000 a[1012] + b[1012] == out[1012] is 21.000000 + 21.000000 = 42.000000 a[1013] + b[1013] == out[1013] is 22.000000 + 22.000000 = 44.000000 a[1014] + b[1014] == out[1014] is 23.000000 + 23.000000 = 46.000000 a[1015] + b[1015] == out[1015] is 24.000000 + 24.000000 = 48.000000 a[1016] + b[1016] == out[1016] is 25.000000 + 25.000000 = 50.000000 a[1017] + b[1017] == out[1017] is 26.000000 + 26.000000 = 52.000000 a[1018] + b[1018] == out[1018] is 27.000000 + 27.000000 = 54.000000 a[1019] + b[1019] == out[1019] is 28.000000 + 28.000000 = 56.000000 a[1020] + b[1020] == out[1020] is 29.000000 + 29.000000 = 58.000000 a[1021] + b[1021] == out[1021] is 30.000000 + 30.000000 = 60.000000 a[1022] + b[1022] == out[1022] is 31.000000 + 31.000000 = 62.000000 a[1023] + b[1023] == out[1023] is 32.000000 + 32.000000 = 64.000000 Passed
10.3.3. dim 2, grid 1, block NxN for a matrix stored in a one-dimensional array
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>

/* Matrix edge length and total element count (n x n matrix, flat storage).
 * N is parenthesized so it expands safely inside larger expressions. */
#define n 32
#define N (n * n)

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                              \
  do {                                                                \
    cudaError_t err_ = (call);                                        \
    if (err_ != cudaSuccess) {                                        \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
              cudaGetErrorString(err_));                              \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while (0)

/*
 * Element-wise sum of two n x n matrices stored row-major in flat arrays:
 * c[row * n + col] = a[row * n + col] + b[row * n + col].
 *
 * Expected launch: one block of n x n threads (threadIdx.x = column,
 * threadIdx.y = row); each thread produces exactly one output element.
 * a, b and c must be device pointers to at least N floats.
 */
__global__ void add(float *a, float *b, float *c){
  int tidx = threadIdx.x;  /* column inside the matrix */
  int tidy = threadIdx.y;  /* row inside the matrix */

  /* Guard so a launch with a block larger than n x n cannot write out of bounds. */
  if (tidx < n && tidy < n) {
    int idx = tidx + tidy * n;
    c[idx] = a[idx] + b[idx];
  }
}

int main() {
  float a[N], b[N], out[N];
  float *d_a, *d_b, *d_out;
  const size_t bytes = sizeof(float) * N;

  /* Fill the host matrices; element (row, col) lives at col + row * n. */
  for (int row = 0; row < n; row++) {
    for (int col = 0; col < n; col++) {
      a[col + row * n] = 1.0f;
      b[col + row * n] = 2.0f;
    }
  }

  /* Allocate device memory */
  CUDA_CHECK(cudaMalloc((void**)&d_a, bytes));
  CUDA_CHECK(cudaMalloc((void**)&d_b, bytes));
  CUDA_CHECK(cudaMalloc((void**)&d_out, bytes));

  /* Transfer data from host to device memory */
  CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

  /* One block holding the whole matrix. n * n threads must not exceed the
   * device's threads-per-block limit; a bad configuration is reported by
   * cudaGetLastError() below. */
  dim3 BlockPerGrid(1, 1, 1);
  dim3 ThreadsPerBlock(n, n, 1);
  add <<< BlockPerGrid, ThreadsPerBlock >>>(d_a, d_b, d_out);
  CUDA_CHECK(cudaGetLastError());

  /* Blocking copy: also waits for the kernel and surfaces execution errors. */
  CUDA_CHECK(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost));

  /* Print every element and verify it against the host-side sum. */
  int mismatches = 0;
  for (int i = 0; i < N; i++) {
    printf("a[%d] + b[%d] == out[%d] is %f + %f = %f\n", i, i, i, a[i], b[i], out[i]);
    if (out[i] != a[i] + b[i]) {
      mismatches++;
    }
  }

  CUDA_CHECK(cudaFree(d_out));
  CUDA_CHECK(cudaFree(d_b));
  CUDA_CHECK(cudaFree(d_a));

  if (mismatches != 0) {
    fprintf(stderr, "Failed: %d of %d elements differ\n", mismatches, N);
    return EXIT_FAILURE;
  }
  printf("Passed\n");
  return 0;
}
cd babel
nvcc gpu_vector_add_dim2_grid1_blockNxN_matrix_witharray.cu -o gpu_vector_add_dim2_grid1_blockNxN_matrix_witharray
./gpu_vector_add_dim2_grid1_blockNxN_matrix_witharray
Footnotes:
对于新到手的项目可以直接放到容器的nginx根目录(快速部署 /usr/share/nginx/html),可以在映射的本机端口进入
In my case:
sudo docker run -d -p 80:80 -v /home/sx/Documents/Java/ANANProject/layuimini/:/usr/share/nginx/html/ nginx
Beispiel
sudo docker images sudo docker pull nginx sudo docker run -d -p 80:80 nginx -> id:xxxxxxx sudo docker exec -it -v /....../project:/usr/share/nginx/html xxxxxxx build in - > local host :80