Commit a707aab2 authored by Ruben

course 2020

parent 2b021eb1
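! Estimate the area of the Mandelbrot set: test an npoints x npoints grid of
! values of c over the upper half of the rectangle [-2,0.5] x [0,1.125] and
! count the points that escape within mxitr iterations; the factor 2 in the
! area formula accounts for the symmetry about the real axis.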
program mandelbrot_area
  implicit none
  integer,parameter :: npoints=2000, mxitr=1000
  double precision,parameter :: eps=1.d-5
  integer i,j,numoutside
  double precision area,error,rp,ri
  numoutside=0
  print *,eps,numoutside
  do i=1,npoints
    do j=1,npoints
      rp=-2.d0+2.5d0*dble(i)/dble(npoints)+eps
      ri=1.125d0*dble(j)/dble(npoints)+eps
      call testpoint(rp,ri,numoutside,mxitr)
    enddo
  enddo
  area=2.d0*2.5d0*1.125d0*dble(npoints*npoints-numoutside)/dble(npoints*npoints)
  error=area/dble(npoints)
  print *,'area: ',area,numoutside
  print *,'error: ',error
end program mandelbrot_area
!***************************************************
subroutine testpoint(rp,ri,numoutside,mxitr)
  implicit none
  integer i
  integer,intent(in) :: mxitr
  integer,intent(inout) :: numoutside
  double precision,intent(in) :: rp,ri
  double precision temp,crp,cri
  crp=rp
  cri=ri
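  ! iterate z -> z**2 + c with c=(rp,ri); (crp,cri) holds the current z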
  do i=1,mxitr
    temp=crp*crp-cri*cri+rp
    cri=crp*cri*2.d0+ri
    crp=temp
    if(crp*crp+cri*cri.gt.4.d0) then
      numoutside=numoutside+1
      exit
    endif
  enddo
  return
end subroutine testpoint
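# Makefile: build and benchmark the matrix-multiplication variants
# (naive, order, threads, target, cuda) and collect the timings read by plot.py.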
CXX=g++
CXXFLAGS=-O2 -Wall -fopenmp
NVCC=nvcc
NVCCFLAGS=-O2
RUN=
# the brace expansion used in the test recipes ({$(RANGE)}) requires bash
SHELL=/bin/bash
RANGE=64,128,256,512,1024#,2048,4096,8192
NTHREADS=4
N=64
all: naive order threads target cuda

naive: naive.cpp utils.hpp
	$(CXX) $(CXXFLAGS) -o naive naive.cpp
order: order.cpp utils.hpp
	$(CXX) $(CXXFLAGS) -o order order.cpp
threads: threads.cpp utils.hpp
	$(CXX) $(CXXFLAGS) -o threads threads.cpp
target: target.cpp utils.hpp
	$(CXX) $(CXXFLAGS) -o target target.cpp
cuda: cuda.cu utils.hpp
	$(NVCC) $(NVCCFLAGS) -o cuda cuda.cu

.PHONY: all clean run test plot
clean:
	rm -f naive order threads target cuda
	rm -f time.*.txt
run: clean all
	@echo -n "naive: " && $(RUN) ./naive $N
	@echo -n "order: " && $(RUN) ./order $N
	@echo -n "threads: " && $(RUN) ./threads $N
	@echo -n "target: " && $(RUN) ./target $N
	@echo -n "cuda: " && $(RUN) ./cuda $N

test: clean all
	rm -f time.*.txt
	export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./naive $$i >> time.naive.txt; done
	export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./order $$i >> time.order.txt; done
	export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./threads $$i >> time.threads.txt; done
	export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./target $$i >> time.target.txt; done
	export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./cuda $$i >> time.cuda.txt; done

plot:
	python plot.py
#include <cstdio>
#include "utils.hpp"
__global__ void cuda_mul(float* m1, float* m2, float* m3, int n) {
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    if (row >= n || col >= n) return; // guard threads that fall outside the matrix
    for (int i = 0; i < n; i++)
        m3[row * n + col] += m1[row * n + i] * m2[i * n + col];
}
void mul(const float *m1, const float *m2, float *m3, int n)
{
    float *dm1 = nullptr, *dm2 = nullptr, *dm3 = nullptr;
    // allocate device buffers and copy the inputs (and the zeroed output) over
    cudaMalloc((void**)&dm1, sizeof(float) * n * n);
    cudaMalloc((void**)&dm2, sizeof(float) * n * n);
    cudaMalloc((void**)&dm3, sizeof(float) * n * n);
    cudaMemcpy(dm1, m1, sizeof(float) * n * n, cudaMemcpyHostToDevice);
    cudaMemcpy(dm2, m2, sizeof(float) * n * n, cudaMemcpyHostToDevice);
    cudaMemcpy(dm3, m3, sizeof(float) * n * n, cudaMemcpyHostToDevice);
    // 16x16 thread blocks; round the grid up so n need not be a multiple of 16
    dim3 blockSize = dim3(16, 16);
    dim3 gridSize = dim3((n + blockSize.x - 1) / blockSize.x,
                         (n + blockSize.y - 1) / blockSize.y);
    cuda_mul<<<gridSize, blockSize>>>(dm1, dm2, dm3, n);
    // the device-to-host copy also synchronizes with the kernel
    cudaMemcpy(m3, dm3, sizeof(float) * n * n, cudaMemcpyDeviceToHost);
    cudaFree(dm1);
    cudaFree(dm2);
    cudaFree(dm3);
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = nullptr, *m2 = nullptr, *m3 = nullptr;
cudaError_t err = cudaMallocHost((void**)&m1, sizeof(float) * n * n);
if(err) printf("Error status is %s\n", cudaGetErrorString(err));
err = cudaMallocHost((void**)&m2, sizeof(float) * n * n);
if(err) printf("Error status is %s\n", cudaGetErrorString(err));
err = cudaMallocHost((void**)&m3, sizeof(float) * n * n);
if(err) printf("Error status is %s\n", cudaGetErrorString(err));
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
cudaFreeHost(m1);
cudaFreeHost(m2);
cudaFreeHost(m3);
}
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
    for(int i=0; i<n; ++i)
        for(int j=0; j<n; ++j)
            for(int k=0; k<n; ++k)
                m3[i*n + j] += m1[i*n + k] * m2[k*n + j];
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
    for(int i=0; i<n; ++i)
        for(int k=0; k<n; ++k)
            for(int j=0; j<n; ++j)
                m3[i*n + j] += m1[i*n + k] * m2[k*n + j];
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
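# plot.py: plot the timings collected by `make test` (time.*.txt, one
# "checksum time" line per matrix size) on a log-log scale.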
import os
import math
import numpy as np
from matplotlib import pyplot as plt
# atleast_2d keeps the [:,1] indexing below valid when a file holds a single row
cuda = np.atleast_2d(np.loadtxt('time.cuda.txt'))
target = np.atleast_2d(np.loadtxt('time.target.txt'))
threads = np.atleast_2d(np.loadtxt('time.threads.txt'))
order = np.atleast_2d(np.loadtxt('time.order.txt'))
naive = np.atleast_2d(np.loadtxt('time.naive.txt'))
l = len(naive)
n = [64 * 2**i for i in range(0, l)]
print(l, n)
plt.figure(figsize=(5,4))
if cuda.any(): plt.plot(n, cuda[:,1], marker='x', linestyle='-', linewidth=2, markersize=6, color='C0', label="CUDA")
if target.any(): plt.plot(n, target[:,1], marker='v', linestyle='-', linewidth=2, markersize=6, color='C1', label="OpenMP Offloading")
if threads.any(): plt.plot(n, threads[:,1], marker='o', linestyle='-', linewidth=2, markersize=6, color='C2', label="OpenMP Threading")
if order.any(): plt.plot(n, order[:,1], marker='>', linestyle='-', linewidth=2, markersize=6, color='C4', label="Loop Order")
if naive.any(): plt.plot(n, naive[:,1], marker='<', linestyle='-', linewidth=2, markersize=6, color='C5', label="Naive")
plt.legend(loc='upper left')
plt.xscale('log')
plt.yscale('log')
plt.xlim(n[0], n[-1])
plt.xticks(n, n)
plt.yticks()
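# axis labels; the time column comes from elapsedTime() in utils.hpp (milliseconds)
plt.xlabel('matrix size n')
plt.ylabel('time [ms]')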
plt.draw()
plt.show()
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
    const int n2 = n * n;
    #pragma omp target map(to: m1[0:n2], m2[0:n2]) map(tofrom: m3[0:n2])
    #pragma omp teams distribute parallel for collapse(2)
    for(int i=0; i<n; ++i)
    {
        for(int j=0; j<n; ++j)
        {
            for (int k = 0; k < n; k++)
                m3[i * n + j] += m1[i * n + k] * m2[k * n + j];
        }
    }
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
    #pragma omp parallel for
    for(int i=0; i<n; ++i)
        for(int k=0; k<n; ++k)
            for(int j=0; j<n; ++j)
                m3[i*n + j] += m1[i*n + k] * m2[k*n + j];
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
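// utils.hpp: shared helpers for the benchmarks (timing with std::chrono,
// random initialization, zeroing and a checksum for validation).
#pragma once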
#include <ctime>
#include <cstdlib>
#include <chrono>
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::time_point<Clock> TimePoint;
typedef std::chrono::duration<double, std::milli> Time; // elapsed time in milliseconds
void zeros(float *m, int n)
{
for(int i=0; i<n*n; ++i)
m[i] = 0;
}
void random(float *m, int n)
{
for(int i=0; i<n*n; ++i)
m[i] = (rand() % 1000 - 500) / 1000.0;
}
float checksum(const float *m3, int n)
{
float sum = 0.0f;
for(int i=0; i<n*n; i++)
sum += m3[i];
return sum;
}
double elapsedTime(TimePoint start, TimePoint stop)
{
return std::chrono::duration_cast<Time>(stop-start).count();
}
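! Serial reference version of the dense product C = A*B with
! A(840x3230) and B(3230x802); timed with cpu_time and optionally
! validated against the matmul intrinsic.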
program matmul_prog
implicit none
integer icrow,iccol
integer ic,i,j
integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col
integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col
integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col
integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/)
double precision, dimension(a_row,a_col) :: a
double precision, dimension(b_row,b_col) :: b
double precision, dimension(c_row,c_col) :: c,c_ref
double precision, dimension(a_elem) :: a_temp
double precision, dimension(b_elem) :: b_temp
double precision num,sum,t1,t2
logical, parameter :: validation=.false.
logical, parameter :: print_result=.false.
if(a_col.ne.b_row) stop 'Wrong matrices dimensions'
!generate initial arrays
call random_seed(put=seed)
do i=1,a_elem
call random_number(num)
a_temp(i)=10.d0*num+1.d0
enddo
do i=1,b_elem
call random_number(num)
b_temp(i)=10.d0*num+1.d0
enddo
a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /)))
b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /)))
call cpu_time(t1)
!matrix multiplication
c=0.d0
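! the two loops over the elements of c are fused into a single loop over
! ic=1..c_elem; icrow/iccol recover the row/column indices (column-major)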
do ic=1,c_elem
  icrow=mod(ic-1,c_row)+1
  iccol=(ic-1)/c_row+1
  do i=1,a_col
    c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol)
  enddo
enddo
call cpu_time(t2)
print *,'Elapsed time:',t2-t1
!Print result
if(print_result) then
  do icrow=1,c_row
    print '(20(1x,f12.5))', (c(icrow,iccol),iccol=1,c_col)
  enddo
endif
!Validation
if(validation) then
  c_ref=matmul(a,b)
  sum=0.d0
  do i=1,c_row
    do j=1,c_col
      sum=sum+c(i,j)-c_ref(i,j)
    enddo
  enddo
  print *,sum,sum/dble(c_elem)
endif
end program matmul_prog
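! OpenMP offloading version: the fused loop over the elements of c is
! executed on the device; timing uses omp_get_wtime and the result is
! validated against the matmul intrinsic.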
program matmul_prog
use omp_lib
implicit none
integer icrow,iccol
integer ic,i,j,num_devices
integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col
integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col
integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col
integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/)
double precision, dimension(a_row,a_col) :: a
double precision, dimension(b_row,b_col) :: b
double precision, dimension(c_row,c_col) :: c,c_ref
double precision, dimension(a_elem) :: a_temp
double precision, dimension(b_elem) :: b_temp
double precision num,sum,t1,t2
logical, parameter :: validation=.true.
logical, parameter :: print_result=.false.
if(a_col.ne.b_row) stop 'Wrong matrices dimensions'
num_devices = omp_get_num_devices()
!generate initial arrays
call random_seed(put=seed)
do i=1,a_elem
call random_number(num)
a_temp(i)=10.d0*num+1.d0
enddo
do i=1,b_elem
call random_number(num)
b_temp(i)=10.d0*num+1.d0
enddo
a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /)))
b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /)))
t1=OMP_GET_WTIME()
!matrix multiplication
c=0.d0
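! a and b are mapped to the device read-only, c is mapped to and from it;
! teams distribute parallel do spreads the fused iteration space over the
! device teams and threads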
!$omp target map(tofrom:c) map(to:a,b)
!$omp teams distribute parallel do private(ic,icrow,iccol,i)
do ic=1,c_elem
  icrow=mod(ic-1,c_row)+1
  iccol=(ic-1)/c_row+1
  do i=1,a_col
    c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol)
  enddo
enddo
!$omp end target
t2=OMP_GET_WTIME()
print *,'Elapsed time:',t2-t1
!Print result
if(print_result) then
  do icrow=1,c_row
    print '(20(1x,f12.5))', (c(icrow,iccol),iccol=1,c_col)
  enddo
endif
!Validation
if(validation) then
  c_ref=matmul(a,b)
  sum=0.d0
  do i=1,c_row
    do j=1,c_col
      sum=sum+c(i,j)-c_ref(i,j)
    enddo
  enddo
  print *,sum,sum/dble(c_elem)
endif
end program matmul_prog
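! OpenMP CPU-threaded version: the fused loop over the elements of c is
! shared among the host threads with a static schedule; timing uses
! omp_get_wtime.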
program matmul_prog
implicit none
integer icrow,iccol
integer ic,i,j
integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col
integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col
integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col
integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/)
double precision, dimension(a_row,a_col) :: a
double precision, dimension(b_row,b_col) :: b
double precision, dimension(c_row,c_col) :: c,c_ref
double precision, dimension(a_elem) :: a_temp
double precision, dimension(b_elem) :: b_temp
double precision num,sum,t1,t2,OMP_GET_WTIME
logical, parameter :: validation=.true.
logical, parameter :: print_result=.false.
if(a_col.ne.b_row) stop 'Wrong matrices dimensions'
!generate initial arrays
call random_seed(put=seed)
do i=1,a_elem
call random_number(num)
a_temp(i)=10.d0*num+1.d0
enddo
do i=1,b_elem
call random_number(num)
b_temp(i)=10.d0*num+1.d0
enddo
a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /)))
b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /)))
t1=OMP_GET_WTIME()
!matrix multiplication
c=0.d0
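! each thread gets a static chunk of the fused index range; every ic maps to
! a distinct c(icrow,iccol), so no two threads write the same element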
!$omp parallel private(ic,icrow,iccol,i)
!$omp do schedule(static)
do ic=1,c_elem
  icrow=mod(ic-1,c_row)+1
  iccol=(ic-1)/c_row+1
  do i=1,a_col
    c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol)
  enddo
enddo
!$omp end do
!$omp end parallel
t2=OMP_GET_WTIME()