diff --git a/OpenMPRef-5.0-0519-print.pdf b/OpenMPRef-5.0-0519-print.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b9f51e6a866dbf9c9fdbc57c2b43fb777575cc66 Binary files /dev/null and b/OpenMPRef-5.0-0519-print.pdf differ diff --git a/mandelbrot.f90 b/mandelbrot.f90 new file mode 100644 index 0000000000000000000000000000000000000000..5ad5a65b44e410bb84c3259c55aa17f3eaf7b825 --- /dev/null +++ b/mandelbrot.f90 @@ -0,0 +1,52 @@ +program mandelbrot_area + + implicit none + + integer,parameter::npoints=2000,mxitr=1000 + double precision,parameter:: eps=1.d-5 + + integer i,j,numoutside + double precision area,error,rp,ri + + numoutside=0 + + print *,eps,numoutside + do i=1,npoints + do j=1,npoints + rp=-2.d0+2.5d0*dble(i)/dble(npoints)+eps + ri=1.125d0*dble(j)/dble(npoints)+eps + call testpoint(rp,ri,numoutside,mxitr) + enddo + enddo + area=2.d0*2.5d0*1.125d0*dble(npoints*npoints-numoutside)/dble(npoints*npoints) + error=area/dble(npoints) + + print *,'area: ',area,numoutside + print *,'error: ',error + +end program mandelbrot_area + + +!*************************************************** +subroutine testpoint(rp,ri,numoutside,mxitr) + implicit none + integer i + integer,intent(in)::mxitr + integer,intent(inout)::numoutside + double precision,intent(in)::rp,ri + double precision temp,crp,cri + + crp=rp + cri=ri + do i=1,mxitr + temp=crp*crp-cri*cri+rp + cri=crp*cri*2.d0+ri + crp=temp + if(crp*crp+cri*cri.gt.4.d0) then + numoutside=numoutside+1 + exit + endif + enddo + + return +end subroutine testpoint diff --git a/matmul/matmul_cpp/Makefile b/matmul/matmul_cpp/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..908f08c5e34afc3110ae61d908714175f4678f15 --- /dev/null +++ b/matmul/matmul_cpp/Makefile @@ -0,0 +1,51 @@ +CXX=g++ +CXXFLAGS=-O2 -Wall -fopenmp +NVCC=nvcc +NVCCFLAGS=-O2 +RUN= + +RANGE=64,128,256,512,1024#,2048,4096,8192 +NTHREADS=4 + +N=64 + +all: naive order threads target cuda + +naive: + $(CXX) $(CXXFLAGS) -o naive naive.cpp + +order: + $(CXX) $(CXXFLAGS) -o order order.cpp + +threads: + $(CXX) $(CXXFLAGS) -o threads threads.cpp + +target: + $(CXX) $(CXXFLAGS) -o target target.cpp + +cuda: + $(NVCC) $(NVCCFLAGS) -o cuda cuda.cu + +.PHONY: clean run test + +clean: + rm -f naive order factorized threads target cuda + rm -f time.*.txt + +run: clean all + @echo -n "naive: " && $(RUN) ./naive $N + @echo -n "order: " && $(RUN) ./order $N + @echo -n "threads: " && $(RUN) ./threads $N + @echo -n "target: " && $(RUN) ./target $N + @echo -n "cuda: " && $(RUN) ./cuda $N + +test: clean all + rm -f time.*.txt + export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./naive $$i >> time.naive.txt; done + export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./order $$i >> time.order.txt; done + export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./threads $$i >> time.threads.txt; done + export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./target $$i >> time.target.txt; done + export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./cuda $$i >> time.cuda.txt; done + +plot: + python plot.py \ No newline at end of file diff --git a/matmul/matmul_cpp/cuda.cu b/matmul/matmul_cpp/cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..3edc2c0f3ffe250cb520f2c8284a7fc1760f4327 --- /dev/null +++ b/matmul/matmul_cpp/cuda.cu @@ -0,0 +1,72 @@ +#include <cstdio> +#include "utils.hpp" + +__global__ void cuda_mul(float* m1, float* m2, float* m3, int 
n) { + + int row = blockIdx.y*blockDim.y+threadIdx.y; + int col = blockIdx.x*blockDim.x+threadIdx.x; + + for (int i = 0; i < n; i++) + m3[row * n + col] += m1[row * n + i] * m2[i * n + col]; +} + +void mul(const float *m1, const float *m2, float *m3, int n) +{ + float *dm1 = nullptr, *dm2 = nullptr, *dm3 = nullptr; + + cudaMalloc((void**)&dm1, sizeof(float) * n * n); + cudaMalloc((void**)&dm2, sizeof(float) * n * n); + cudaMalloc((void**)&dm3, sizeof(float) * n * n); + + cudaMemcpy(dm1, m1, sizeof(float) * n * n, cudaMemcpyHostToDevice); + cudaMemcpy(dm2, m2, sizeof(float) * n * n, cudaMemcpyHostToDevice); + cudaMemcpy(dm3, m3, sizeof(float) * n * n, cudaMemcpyHostToDevice); + + dim3 blockSize = dim3(16, 16); + dim3 gridSize = dim3(n / blockSize.x, n/ blockSize.y); + + cuda_mul<<<gridSize, blockSize>>>(dm1, dm2, dm3, n); + + cudaMemcpy(m3, dm3, sizeof(float) * n * n, cudaMemcpyDeviceToHost); + + cudaFree(dm1); + cudaFree(dm2); + cudaFree(dm3); +} + +int main(int argc, char **argv) +{ + int n = 64; + if(argc > 1) + n = atoi(argv[1]); + + srand(12); + + float *m1 = nullptr, *m2 = nullptr, *m3 = nullptr; + + cudaError_t err = cudaMallocHost((void**)&m1, sizeof(float) * n * n); + if(err) printf("Error status is %s\n", cudaGetErrorString(err)); + err = cudaMallocHost((void**)&m2, sizeof(float) * n * n); + if(err) printf("Error status is %s\n", cudaGetErrorString(err)); + err = cudaMallocHost((void**)&m3, sizeof(float) * n * n); + if(err) printf("Error status is %s\n", cudaGetErrorString(err)); + + random(m1, n); + random(m2, n); + random(m3, n); + + zeros(m3, n); + + TimePoint tstart, tstop; + + tstart = Clock::now(); + mul(m1, m2, m3, n); + tstop = Clock::now(); + + //printf("checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop)); + printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop)); + + cudaFreeHost(m1); + cudaFreeHost(m2); + cudaFreeHost(m3); +} diff --git a/matmul/matmul_cpp/naive.cpp b/matmul/matmul_cpp/naive.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0e2d02b084759826fdaba1f52d5d3819f56c7553 --- /dev/null +++ b/matmul/matmul_cpp/naive.cpp @@ -0,0 +1,42 @@ +#include <cstdio> +#include "utils.hpp" + +void mul(const float *m1, const float *m2, float *m3, int n) +{ + for(int i=0; i<n; ++i) + for(int j=0; j<n; ++j) + for(int k=0; k<n; ++k) + m3[i*n+j] += m1[i*n + k] * m2[k*n+j]; +} + +int main(int argc, char **argv) +{ + int n = 64; + if(argc > 1) + n = atoi(argv[1]); + + srand(12); + + float *m1 = new float[n*n]; + float *m2 = new float[n*n]; + float *m3 = new float[n*n]; + + random(m1, n); + random(m2, n); + random(m3, n); + + zeros(m3, n); + + TimePoint tstart, tstop; + + tstart = Clock::now(); + mul(m1, m2, m3, n); + tstop = Clock::now(); + + //printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop)); + printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop)); + + delete[] m1; + delete[] m2; + delete[] m3; +} diff --git a/matmul/matmul_cpp/order.cpp b/matmul/matmul_cpp/order.cpp new file mode 100644 index 0000000000000000000000000000000000000000..656867d2a17078f611391a3d3233ebef21731b21 --- /dev/null +++ b/matmul/matmul_cpp/order.cpp @@ -0,0 +1,42 @@ +#include <cstdio> +#include "utils.hpp" + +void mul(const float *m1, const float *m2, float *m3, int n) +{ + for(int i=0; i<n; ++i) + for(int k=0; k<n; ++k) + for(int j=0; j<n; ++j) + m3[ i * n + j] += m1[i * n + k] * m2[k * n + j]; +} + +int main(int argc, char **argv) +{ + int n = 64; + if(argc > 1) + n = atoi(argv[1]); + + srand(12); + + float 
*m1 = new float[n*n]; + float *m2 = new float[n*n]; + float *m3 = new float[n*n]; + + random(m1, n); + random(m2, n); + random(m3, n); + + zeros(m3, n); + + TimePoint tstart, tstop; + + tstart = Clock::now(); + mul(m1, m2, m3, n); + tstop = Clock::now(); + + //printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop)); + printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop)); + + delete[] m1; + delete[] m2; + delete[] m3; +} diff --git a/matmul/matmul_cpp/plot.py b/matmul/matmul_cpp/plot.py new file mode 100644 index 0000000000000000000000000000000000000000..0ecf012148c9dfe254dfc74d27818ba8bc3bb3cd --- /dev/null +++ b/matmul/matmul_cpp/plot.py @@ -0,0 +1,31 @@ +import os +import math +import numpy as np +from matplotlib import pyplot as plt + +cuda = np.loadtxt('time.cuda.txt') +target = np.loadtxt('time.target.txt') +threads = np.loadtxt('time.threads.txt') +order = np.loadtxt('time.order.txt') +naive = np.loadtxt('time.naive.txt') + +l = len(naive) +n = [64 * 2**i for i in range(0, l)] + +print(l, n) + +plt.figure(figsize=(5,4)) +if cuda.any(): plt.plot(n, cuda[:,1], marker='x', linestyle='-', linewidth=2, markersize=6, color='C0', label=r"CUDA")#($f_{child} = 2, n_{p} = 64$)") +if target.any(): plt.plot(n, target[:,1], marker='v', linestyle='-', linewidth=2, markersize=6, color='C1', label=r"OpenMP Offloading")#($f_{child} = 2, n_{p} = 64$)") +if threads.any(): plt.plot(n, threads[:,1], marker='o', linestyle='-', linewidth=2, markersize=6, color='C2', label=r"OpenMP Threading")#($f_{child} = 2, n_{p} = 64$)") +if order.any(): plt.plot(n, order[:,1], marker='>', linestyle='-', linewidth=2, markersize=6, color='C4', label=r"Loop Order")#($f_{child} = 2, n_{p} = 64$)") +if naive.any(): plt.plot(n, naive[:,1], marker='<', linestyle='-', linewidth=2, markersize=6, color='C5', label=r"Naive")#($f_{child} = 2, n_{p} = 64$)") +plt.legend(loc='upper left') +plt.xscale('log') +plt.yscale('log') +plt.xlim(n[0], n[-1]) +plt.xticks(n, n) +plt.yticks() +plt.draw() + +plt.show() \ No newline at end of file diff --git a/matmul/matmul_cpp/target.cpp b/matmul/matmul_cpp/target.cpp new file mode 100644 index 0000000000000000000000000000000000000000..100ba31c7d65e2be40ce435cd4d0382695548fec --- /dev/null +++ b/matmul/matmul_cpp/target.cpp @@ -0,0 +1,50 @@ +#include <cstdio> +#include "utils.hpp" + +void mul(const float *m1, const float *m2, float *m3, int n) +{ + const int n2 = n * n; + + #pragma omp target map(to: m1[0:n2], m2[0:n2]), map(tofrom: m3[0:n2]) + #pragma omp teams distribute parallel for collapse(2) + for(int i=0; i<n; ++i) + { + for(int j=0; j<n; ++j) + { + for (int k = 0; k < n; k++) + m3[i * n + j] += m1[i * n + k] * m2[k * n + j]; + } + } +} + +int main(int argc, char **argv) +{ + int n = 64; + if(argc > 1) + n = atoi(argv[1]); + + srand(12); + + float *m1 = new float[n*n]; + float *m2 = new float[n*n]; + float *m3 = new float[n*n]; + + random(m1, n); + random(m2, n); + random(m3, n); + + zeros(m3, n); + + TimePoint tstart, tstop; + + tstart = Clock::now(); + mul(m1, m2, m3, n); + tstop = Clock::now(); + + //printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop)); + printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop)); + + delete[] m1; + delete[] m2; + delete[] m3; +} diff --git a/matmul/matmul_cpp/threads.cpp b/matmul/matmul_cpp/threads.cpp new file mode 100644 index 0000000000000000000000000000000000000000..21ba4de1113a875f75a640adf88af5e04eda152b --- /dev/null +++ b/matmul/matmul_cpp/threads.cpp @@ -0,0 +1,43 @@ 
+#include <cstdio> +#include "utils.hpp" + +void mul(const float *m1, const float *m2, float *m3, int n) +{ + #pragma omp parallel for + for(int i=0; i<n; ++i) + for(int k=0; k<n; ++k) + for(int j=0; j<n; ++j) + m3[ i * n + j] += m1[i * n + k] * m2[k * n + j]; +} + +int main(int argc, char **argv) +{ + int n = 64; + if(argc > 1) + n = atoi(argv[1]); + + srand(12); + + float *m1 = new float[n*n]; + float *m2 = new float[n*n]; + float *m3 = new float[n*n]; + + random(m1, n); + random(m2, n); + random(m3, n); + + zeros(m3, n); + + TimePoint tstart, tstop; + + tstart = Clock::now(); + mul(m1, m2, m3, n); + tstop = Clock::now(); + + //printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop)); + printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop)); + + delete[] m1; + delete[] m2; + delete[] m3; +} diff --git a/matmul/matmul_cpp/utils.hpp b/matmul/matmul_cpp/utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..579155955b3f80d92b8d0f88658dc027235095a0 --- /dev/null +++ b/matmul/matmul_cpp/utils.hpp @@ -0,0 +1,32 @@ +#include <ctime> +#include <cstdlib> +#include <chrono> + +typedef std::chrono::high_resolution_clock Clock; +typedef std::chrono::time_point<Clock> TimePoint; +typedef std::chrono::duration<double> Time; + +void zeros(float *m, int n) +{ + for(int i=0; i<n*n; ++i) + m[i] = 0; +} + +void random(float *m, int n) +{ + for(int i=0; i<n*n; ++i) + m[i] = (rand() % 1000 - 500) / 1000.0; +} + +float checksum(const float *m3, int n) +{ + float sum = 0.0f; + for(int i=0; i<n*n; i++) + sum += m3[i]; + return sum; +} + +double elapsedTime(TimePoint start, TimePoint stop) +{ + return std::chrono::duration_cast<Time>(stop-start).count(); +} diff --git a/matmul/matmul_f90/mat_mul.f90 b/matmul/matmul_f90/mat_mul.f90 new file mode 100644 index 0000000000000000000000000000000000000000..9f5741cc76b977cd0f3a744184720f73373c4d72 --- /dev/null +++ b/matmul/matmul_f90/mat_mul.f90 @@ -0,0 +1,80 @@ + program matmul_prog + + implicit none + + integer icrow,iccol + integer ic,i,j + integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col + integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col + integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col + + integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/) + + double precision, dimension(a_row,a_col) :: a + double precision, dimension(b_row,b_col) :: b + double precision, dimension(c_row,c_col) :: c,c_ref + double precision, dimension(a_elem) :: a_temp + double precision, dimension(b_elem) :: b_temp + + double precision num,sum,t1,t2 + + logical, parameter :: validation=.false. + logical, parameter :: print_result=.false. 
+ + if(a_col.ne.b_row) stop 'Wrong matrices dimensions' + + !generate initial arrays + call random_seed(put=seed) + + do i=1,a_elem + call random_number(num) + a_temp(i)=10.d0*num+1.d0 + enddo + do i=1,b_elem + call random_number(num) + b_temp(i)=10.d0*num+1.d0 + enddo + a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /))) + b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /))) + + + call cpu_time(t1) + !matrix multiplication + c=0.d0 + do ic=1,c_elem + icrow=mod(ic-1,c_row)+1 + iccol=(ic-1)/c_row+1 + do i=1,a_col + c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol) + enddo + enddo + call cpu_time(t2) + + print *,'Elapsed time:',t2-t1 + + !Print result + if(print_result) then + do icrow=1,c_row + print '(20(1x,f12.5))', (c(icrow,iccol),iccol=1,c_col) + enddo + endif + + + !Validation + + if(validation) then + c_ref=matmul(a,b) + + do i=1,c_row + do j=1,c_col + sum=sum+c(i,j)-c_ref(i,j) + enddo + enddo + print *,sum,sum/dble(c_elem) + endif + + end program matmul_prog + + + + diff --git a/matmul/matmul_f90/mat_mul_gpu.f90 b/matmul/matmul_f90/mat_mul_gpu.f90 new file mode 100644 index 0000000000000000000000000000000000000000..0dbfa52db21183807e12bc8bc3a6cf74ec02fb46 --- /dev/null +++ b/matmul/matmul_f90/mat_mul_gpu.f90 @@ -0,0 +1,87 @@ + program matmul_prog + + use omp_lib + + implicit none + + integer icrow,iccol + integer ic,i,j,num_devices + integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col + integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col + integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col + + integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/) + + double precision, dimension(a_row,a_col) :: a + double precision, dimension(b_row,b_col) :: b + double precision, dimension(c_row,c_col) :: c,c_ref + double precision, dimension(a_elem) :: a_temp + double precision, dimension(b_elem) :: b_temp + + double precision num,sum,t1,t2 + + logical, parameter :: validation=.true. + logical, parameter :: print_result=.false. 
+
+      if(a_col.ne.b_row) stop 'Wrong matrices dimensions'
+
+      num_devices = omp_get_num_devices()
+
+      !generate initial arrays
+      call random_seed(put=seed)
+
+      do i=1,a_elem
+       call random_number(num)
+       a_temp(i)=10.d0*num+1.d0
+      enddo
+      do i=1,b_elem
+       call random_number(num)
+       b_temp(i)=10.d0*num+1.d0
+      enddo
+      a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /)))
+      b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /)))
+
+
+      t1=OMP_GET_WTIME()
+      !matrix multiplication
+      c=0.d0
+      !$omp target map(tofrom:c) map(to:a,b)
+      !$omp teams distribute parallel do private(ic,icrow,iccol,i)
+      do ic=1,c_elem
+       icrow=mod(ic-1,c_row)+1
+       iccol=(ic-1)/c_row+1
+       do i=1,a_col
+        c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol)
+       enddo
+      enddo
+      !$omp end target
+      t2=OMP_GET_WTIME()
+
+      print *,'Elapsed time:',t2-t1
+
+      !Print result
+      if(print_result) then
+       do icrow=1,c_row
+        print '(20(1x,f12.5))', (c(icrow,iccol),iccol=1,c_col)
+       enddo
+      endif
+
+
+      !Validation
+
+      if(validation) then
+       c_ref=matmul(a,b)
+       sum=0.d0
+       do i=1,c_row
+        do j=1,c_col
+         sum=sum+c(i,j)-c_ref(i,j)
+        enddo
+       enddo
+       print *,sum,sum/dble(c_elem)
+      endif
+
+      end program matmul_prog
+
+
+
+
diff --git a/matmul/matmul_f90/mat_mul_omp.f90 b/matmul/matmul_f90/mat_mul_omp.f90
new file mode 100644
index 0000000000000000000000000000000000000000..cce6ba1f1ddfee03f82a1078b015bbb187e09749
--- /dev/null
+++ b/matmul/matmul_f90/mat_mul_omp.f90
@@ -0,0 +1,84 @@
+      program matmul_prog
+
+      implicit none
+
+      integer icrow,iccol
+      integer ic,i,j
+      integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col
+      integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col
+      integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col
+
+      integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/)
+
+      double precision, dimension(a_row,a_col) :: a
+      double precision, dimension(b_row,b_col) :: b
+      double precision, dimension(c_row,c_col) :: c,c_ref
+      double precision, dimension(a_elem) :: a_temp
+      double precision, dimension(b_elem) :: b_temp
+
+      double precision num,sum,t1,t2,OMP_GET_WTIME
+
+      logical, parameter :: validation=.true.
+      logical, parameter :: print_result=.false.
+
+      if(a_col.ne.b_row) stop 'Wrong matrices dimensions'
+
+      !generate initial arrays
+      call random_seed(put=seed)
+
+      do i=1,a_elem
+       call random_number(num)
+       a_temp(i)=10.d0*num+1.d0
+      enddo
+      do i=1,b_elem
+       call random_number(num)
+       b_temp(i)=10.d0*num+1.d0
+      enddo
+      a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /)))
+      b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /)))
+
+
+      t1=OMP_GET_WTIME()
+      !matrix multiplication
+      c=0.d0
+      !$omp parallel private(ic,icrow,iccol,i)
+      !$omp do schedule(static)
+      do ic=1,c_elem
+       icrow=mod(ic-1,c_row)+1
+       iccol=(ic-1)/c_row+1
+       do i=1,a_col
+        c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol)
+       enddo
+      enddo
+      !$omp end do
+      !$omp end parallel
+      t2=OMP_GET_WTIME()
+
+      print *,'Elapsed time:',t2-t1
+
+      !Print result
+      if(print_result) then
+       do icrow=1,c_row
+        print '(20(1x,f12.5))', (c(icrow,iccol),iccol=1,c_col)
+       enddo
+      endif
+
+
+      !Validation
+
+      if(validation) then
+       c_ref=matmul(a,b)
+       sum=0.d0
+       do i=1,c_row
+        do j=1,c_col
+         sum=sum+c(i,j)-c_ref(i,j)
+        enddo
+       enddo
+       print *,sum,sum/dble(c_elem)
+      endif
+
+      end program matmul_prog
+
+
+
+
diff --git a/matmul/matmul_python/mat_mul_python_gpu.py b/matmul/matmul_python/mat_mul_python_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e3efe85e06434c6c449450e74cc5c8d84b69e82
--- /dev/null
+++ b/matmul/matmul_python/mat_mul_python_gpu.py
@@ -0,0 +1,51 @@
+import numpy as np
+import time
+from numpy.random import seed
+from numpy.random import rand
+from numba import jit,cuda
+
+size_array=4096
+size_vec=size_array*size_array
+validate=True
+
+def generate_array(num,size):
+    seed(num)
+    lst=10.*rand(size)+1.
+    return lst
+
+
+a=np.array(generate_array(1,size_vec))
+b=np.array(generate_array(2,size_vec))
+c=np.zeros(size_vec)
+
+@cuda.jit
+def mat_mul(a,b,c,size):
+    row=cuda.blockIdx.y*cuda.blockDim.y+cuda.threadIdx.y
+    col=cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x
+    for i in range(size):
+        c[row*size+col]+=a[row*size+i]*b[i*size+col]
+
+
+threadsperblock = (16,16)
+blockspergrid_x = int(np.ceil(size_array / threadsperblock[0]))
+blockspergrid_y = int(np.ceil(size_array / threadsperblock[1]))
+blockspergrid = (blockspergrid_x, blockspergrid_y)
+start=time.time()
+mat_mul[blockspergrid, threadsperblock](a,b,c,size_array)
+end=time.time()
+print('Elapsed time: ',end-start)
+
+c=np.zeros(size_vec)
+start=time.time()
+mat_mul[blockspergrid, threadsperblock](a,b,c,size_array)
+end=time.time()
+print('Elapsed time: ',end-start)
+
+if(validate):
+    a2d=np.reshape(a,(size_array,-1))
+    b2d=np.reshape(b,(size_array,-1))
+    c2d_numba=np.reshape(c,(size_array,-1))
+    c2d=np.matmul(a2d,b2d)
+    dif=np.abs(c2d_numba-c2d)
+    summation=np.sum(dif)
+    print(summation,summation/float(size_vec))
diff --git a/matmul/matmul_python/mat_mul_python_numba.py b/matmul/matmul_python/mat_mul_python_numba.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce23caa7db43844de30055c9f9feee155f3ada6c
--- /dev/null
+++ b/matmul/matmul_python/mat_mul_python_numba.py
@@ -0,0 +1,41 @@
+import numpy as np
+import time
+from numpy.random import seed
+from numpy.random import rand
+from numba import jit,prange
+
+size_array=4096
+size_vec=size_array*size_array
+
+def generate_array(num,size):
+    seed(num)
+    lst=10.*rand(size)+1.
+    return lst
+
+
+a=np.array(generate_array(1,size_vec))
+b=np.array(generate_array(2,size_vec))
+c=np.zeros(size_vec)
+
+@jit
+def mat_mul(a,b,c,size):
+    for i in range(size):
+        for k in range(size):
+            for j in range(size):
+                c[i*size+j]+=a[i*size+k]*b[k*size+j]
+
+start=time.time()
+mat_mul(a,b,c,size_array)
+end=time.time()
+print('Elapsed time: ',end-start)
+
+c=np.zeros(size_vec)
+
+start=time.time()
+mat_mul(a,b,c,size_array)
+end=time.time()
+
+
+
+print('Elapsed time: ',end-start)
+
diff --git a/matmul/matmul_python/mat_mul_python_threads.py b/matmul/matmul_python/mat_mul_python_threads.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0645050efc7cd55a199404232d6a0862f072f2a
--- /dev/null
+++ b/matmul/matmul_python/mat_mul_python_threads.py
@@ -0,0 +1,41 @@
+import numpy as np
+import time
+from numpy.random import seed
+from numpy.random import rand
+from numba import jit,prange
+
+size_array=4096
+size_vec=size_array*size_array
+
+def generate_array(num,size):
+    seed(num)
+    lst=10.*rand(size)+1.
+    return lst
+
+
+a=np.array(generate_array(1,size_vec))
+b=np.array(generate_array(2,size_vec))
+c=np.zeros(size_vec)
+
+@jit(parallel=True)
+def mat_mul(a,b,c,size):
+    for i in prange(size):
+        for k in range(size):
+            for j in range(size):
+                c[i*size+j]+=a[i*size+k]*b[k*size+j]
+
+start=time.time()
+mat_mul(a,b,c,size_array)
+end=time.time()
+print('Elapsed time: ',end-start)
+
+c=np.zeros(size_vec)
+
+start=time.time()
+mat_mul(a,b,c,size_array)
+end=time.time()
+
+
+
+print('Elapsed time: ',end-start)
+
diff --git a/montecarlo_pi.f90 b/montecarlo_pi.f90
new file mode 100644
index 0000000000000000000000000000000000000000..10c60291438a8bcec1348b67192d360aaefd1798
--- /dev/null
+++ b/montecarlo_pi.f90
@@ -0,0 +1,22 @@
+program montecarlo_pi
+
+  implicit none
+
+  integer i,countin
+  integer,parameter::npoints=100000000
+  double precision num,pi,pi_true,x,y,random_last
+
+  pi_true=acos(-1.d0)
+  countin=0
+  random_last=0
+  do i=1,npoints
+   call random_number(x)
+   call random_number(y)
+   if(x*x+y*y.le.1.d0)countin=countin+1
+  enddo
+
+  pi=dble(countin)/dble(npoints)*4.d0
+
+  print *,'Calculated: ',pi,'Real: ',pi_true
+
+end program montecarlo_pi
diff --git a/montecarlo_pi_v2.f90 b/montecarlo_pi_v2.f90
new file mode 100644
index 0000000000000000000000000000000000000000..177b785271c114deeb7608ca29e82fceb59f4192
--- /dev/null
+++ b/montecarlo_pi_v2.f90
@@ -0,0 +1,39 @@
+program montecarlo_pi_v2
+
+  implicit none
+
+  integer i,countin,id
+  integer,parameter::npoints=100000000
+  double precision num,pi,pi_true,x,y
+  integer random_last
+
+  pi_true=acos(-1.d0)
+  countin=0
+  random_last=0
+  do i=1,npoints
+   call LCG_random(x,random_last)
+   call LCG_random(y,random_last)
+   if(x*x+y*y.le.1.d0)countin=countin+1
+  enddo
+
+  pi=dble(countin)/dble(npoints)*4.d0
+
+  print *,pi,pi_true
+
+end program montecarlo_pi_v2
+
+
+!***************************************************
+subroutine LCG_random(num,random_last)
+
+  implicit none
+  integer,parameter::multiplier=1366,addend=150889,pmod=714025
+  integer,intent(inout)::random_last
+  double precision num
+
+  num=dble(mod(multiplier*random_last+addend,pmod))
+  random_last=num
+  num=num/dble(pmod)
+
+  return
+end subroutine LCG_random
diff --git a/openmp_2020_print.pdf b/openmp_2020_print.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..37485482bc37339e1832c461f61814faab089bcc
Binary files /dev/null and b/openmp_2020_print.pdf differ
diff --git a/pi.f90 b/pi.f90
new file mode 100644
index
0000000000000000000000000000000000000000..1a219a1090f3d0709f7c1e849587f57ba54678ff --- /dev/null +++ b/pi.f90 @@ -0,0 +1,23 @@ +program pi_serial + + implicit none + + + integer, parameter::steps=1000000000 + integer i + double precision dx,x,sum,pi + + pi=acos(-1.d0) + dx=1.d0/dble(steps) + sum=0.d0 + + do i=1,steps + x=(dble(i)-.5d0)*dx + sum=sum+4.d0/(1.d0+x*x) + enddo + + sum=sum*dx + + print *,sum,abs(pi-sum)/pi + +end program pi_serial diff --git a/python_multiproc.py b/python_multiproc.py new file mode 100644 index 0000000000000000000000000000000000000000..9994164a316f55f541a2c4816347337b146195ee --- /dev/null +++ b/python_multiproc.py @@ -0,0 +1,48 @@ +from multiprocessing import Process, Manager + +def func1(n,j): + j=0 + print('func1: starting (from 1 to 100000000)') + for i in range(1,100000001): + j=j+i/2 + print('func1: finishing',j) + results[n]=j + +def func2(n,j): + j=0 + print('func2: starting (from 100000001 to 200000000)') + for i in range(100000001,200000001): + j=j+i/2 + print('func2: finishing',j) + results[n]=j + +def func3(n,j): + j=0 + print('func3: starting (from 200000001 to 300000000)') + for i in range(200000001,300000001): + j=j+i/2 + print('func3: finishing',j) + results[n]=j + +def runInParallel(*fns): + proc=[] + for fn in fns: + p = Process(target=fn) + p.start() + proc.append(p) + for p in proc: + p.join() + +if __name__ == '__main__': + results=Manager().dict() + p1=Process(target=func1,args=(0,results)) + p1.start() + p2=Process(target=func2,args=(1,results)) + p2.start() + p3=Process(target=func3,args=(2,results)) + p3.start() + p1.join() + p2.join() + p3.join() + print (results) + print (sum(results.values())) diff --git a/python_multiproc_2.py b/python_multiproc_2.py new file mode 100644 index 0000000000000000000000000000000000000000..e58edd47d6586d1cb5142006a3beea400db3f4ab --- /dev/null +++ b/python_multiproc_2.py @@ -0,0 +1,45 @@ +from multiprocessing import Process,Manager + +def func1(id,results): + j=0 + print('func1: starting (from 1 to 100000000)') + for i in range(1,100000001): + j=j+i/2 + print('func1: finishing') + results[id]=j + +def func2(id,results): + j=0 + print('func2: starting (from 100000001 to 200000000)') + for i in range(100000001,200000001): + j=j+i/2 + print('func2: finishing') + results[id]=j + +def func3(id,results): + j=0 + print('func3: starting (from 200000001 to 300000000)') + for i in range(200000001,300000001): + j=j+i/2 + print('func3: finishing') + results[id]=j + +def runInParallel(*fns): + proc=[] + for i,fn in enumerate(fns): + p = Process(target=fn,args=(i,results)) + p.start() + proc.append(p) + for p in proc: + p.join() + print(results) + print(sum(results.values())) + + +if __name__ == '__main__': + results=Manager().dict() + runInParallel (func1,func2,func3) + + + + diff --git a/python_numba.py b/python_numba.py new file mode 100644 index 0000000000000000000000000000000000000000..73060bd670ae66724e10abe68f2e9232729f28a7 --- /dev/null +++ b/python_numba.py @@ -0,0 +1,12 @@ +from numba import jit + +@jit() +def func1(): + j=0 + print('func1: starting') + for i in range(1,300000001): + j=j+i/2 + print('func1: finishing',j) + +if __name__ == '__main__': + func1() diff --git a/python_serial.py b/python_serial.py new file mode 100644 index 0000000000000000000000000000000000000000..6e8597cfcc6829369b9026d01a87f173b07626b1 --- /dev/null +++ b/python_serial.py @@ -0,0 +1,13 @@ +def func1(): + j=0 + print('func: starting (from 1 to 300000000)') + for i in range(1,300000001): + j=j+i/2 + print('func1: finishing',j) + 
+if __name__ == '__main__': + func1() + + + +
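For reference, the running totals printed by python_serial.py, python_multiproc.py, and python_multiproc_2.py can be checked against the closed form sum_{i=a}^{b} i/2 = (b*(b+1) - (a-1)*a)/4. A minimal sketch of that check (standard-library Python only; the helper name partial_sum is illustrative and not part of the patch):

def partial_sum(a, b):
    # closed form of sum(i/2 for i in range(a, b + 1))
    return (b * (b + 1) - (a - 1) * a) / 4

# expected total for func1 in python_serial.py (i = 1 .. 300000000)
print(partial_sum(1, 300000000))
# expected combined total of the three workers, as printed by python_multiproc_2.py
print(partial_sum(1, 100000000)
      + partial_sum(100000001, 200000000)
      + partial_sum(200000001, 300000000))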