Skip to content
Snippets Groups Projects
Commit a707aab2 authored by Ruben's avatar Ruben
Browse files

course 2020

parent 2b021eb1
Branches
No related tags found
No related merge requests found
Showing
with 883 additions and 0 deletions
File added
program mandelbrot_area
implicit none
integer,parameter::npoints=2000,mxitr=1000
double precision,parameter:: eps=1.d-5
integer i,j,numoutside
double precision area,error,rp,ri
numoutside=0
print *,eps,numoutside
do i=1,npoints
do j=1,npoints
rp=-2.d0+2.5d0*dble(i)/dble(npoints)+eps
ri=1.125d0*dble(j)/dble(npoints)+eps
call testpoint(rp,ri,numoutside,mxitr)
enddo
enddo
area=2.d0*2.5d0*1.125d0*dble(npoints*npoints-numoutside)/dble(npoints*npoints)
error=area/dble(npoints)
print *,'area: ',area,numoutside
print *,'error: ',error
end program mandelbrot_area
!***************************************************
subroutine testpoint(rp,ri,numoutside,mxitr)
implicit none
integer i
integer,intent(in)::mxitr
integer,intent(inout)::numoutside
double precision,intent(in)::rp,ri
double precision temp,crp,cri
crp=rp
cri=ri
do i=1,mxitr
temp=crp*crp-cri*cri+rp
cri=crp*cri*2.d0+ri
crp=temp
if(crp*crp+cri*cri.gt.4.d0) then
numoutside=numoutside+1
exit
endif
enddo
return
end subroutine testpoint
CXX=g++
CXXFLAGS=-O2 -Wall -fopenmp
NVCC=nvcc
NVCCFLAGS=-O2
RUN=
RANGE=64,128,256,512,1024#,2048,4096,8192
NTHREADS=4
N=64
all: naive order threads target cuda
naive:
$(CXX) $(CXXFLAGS) -o naive naive.cpp
order:
$(CXX) $(CXXFLAGS) -o order order.cpp
threads:
$(CXX) $(CXXFLAGS) -o threads threads.cpp
target:
$(CXX) $(CXXFLAGS) -o target target.cpp
cuda:
$(NVCC) $(NVCCFLAGS) -o cuda cuda.cu
.PHONY: clean run test
clean:
rm -f naive order factorized threads target cuda
rm -f time.*.txt
run: clean all
@echo -n "naive: " && $(RUN) ./naive $N
@echo -n "order: " && $(RUN) ./order $N
@echo -n "threads: " && $(RUN) ./threads $N
@echo -n "target: " && $(RUN) ./target $N
@echo -n "cuda: " && $(RUN) ./cuda $N
test: clean all
rm -f time.*.txt
export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./naive $$i >> time.naive.txt; done
export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./order $$i >> time.order.txt; done
export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./threads $$i >> time.threads.txt; done
export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./target $$i >> time.target.txt; done
export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./cuda $$i >> time.cuda.txt; done
plot:
python plot.py
\ No newline at end of file
#include <cstdio>
#include "utils.hpp"
__global__ void cuda_mul(float* m1, float* m2, float* m3, int n) {
int row = blockIdx.y*blockDim.y+threadIdx.y;
int col = blockIdx.x*blockDim.x+threadIdx.x;
for (int i = 0; i < n; i++)
m3[row * n + col] += m1[row * n + i] * m2[i * n + col];
}
void mul(const float *m1, const float *m2, float *m3, int n)
{
float *dm1 = nullptr, *dm2 = nullptr, *dm3 = nullptr;
cudaMalloc((void**)&dm1, sizeof(float) * n * n);
cudaMalloc((void**)&dm2, sizeof(float) * n * n);
cudaMalloc((void**)&dm3, sizeof(float) * n * n);
cudaMemcpy(dm1, m1, sizeof(float) * n * n, cudaMemcpyHostToDevice);
cudaMemcpy(dm2, m2, sizeof(float) * n * n, cudaMemcpyHostToDevice);
cudaMemcpy(dm3, m3, sizeof(float) * n * n, cudaMemcpyHostToDevice);
dim3 blockSize = dim3(16, 16);
dim3 gridSize = dim3(n / blockSize.x, n/ blockSize.y);
cuda_mul<<<gridSize, blockSize>>>(dm1, dm2, dm3, n);
cudaMemcpy(m3, dm3, sizeof(float) * n * n, cudaMemcpyDeviceToHost);
cudaFree(dm1);
cudaFree(dm2);
cudaFree(dm3);
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = nullptr, *m2 = nullptr, *m3 = nullptr;
cudaError_t err = cudaMallocHost((void**)&m1, sizeof(float) * n * n);
if(err) printf("Error status is %s\n", cudaGetErrorString(err));
err = cudaMallocHost((void**)&m2, sizeof(float) * n * n);
if(err) printf("Error status is %s\n", cudaGetErrorString(err));
err = cudaMallocHost((void**)&m3, sizeof(float) * n * n);
if(err) printf("Error status is %s\n", cudaGetErrorString(err));
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
cudaFreeHost(m1);
cudaFreeHost(m2);
cudaFreeHost(m3);
}
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
for(int i=0; i<n; ++i)
for(int j=0; j<n; ++j)
for(int k=0; k<n; ++k)
m3[i*n+j] += m1[i*n + k] * m2[k*n+j];
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
for(int i=0; i<n; ++i)
for(int k=0; k<n; ++k)
for(int j=0; j<n; ++j)
m3[ i * n + j] += m1[i * n + k] * m2[k * n + j];
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
import os
import math
import numpy as np
from matplotlib import pyplot as plt
cuda = np.loadtxt('time.cuda.txt')
target = np.loadtxt('time.target.txt')
threads = np.loadtxt('time.threads.txt')
order = np.loadtxt('time.order.txt')
naive = np.loadtxt('time.naive.txt')
l = len(naive)
n = [64 * 2**i for i in range(0, l)]
print(l, n)
plt.figure(figsize=(5,4))
if cuda.any(): plt.plot(n, cuda[:,1], marker='x', linestyle='-', linewidth=2, markersize=6, color='C0', label=r"CUDA")#($f_{child} = 2, n_{p} = 64$)")
if target.any(): plt.plot(n, target[:,1], marker='v', linestyle='-', linewidth=2, markersize=6, color='C1', label=r"OpenMP Offloading")#($f_{child} = 2, n_{p} = 64$)")
if threads.any(): plt.plot(n, threads[:,1], marker='o', linestyle='-', linewidth=2, markersize=6, color='C2', label=r"OpenMP Threading")#($f_{child} = 2, n_{p} = 64$)")
if order.any(): plt.plot(n, order[:,1], marker='>', linestyle='-', linewidth=2, markersize=6, color='C4', label=r"Loop Order")#($f_{child} = 2, n_{p} = 64$)")
if naive.any(): plt.plot(n, naive[:,1], marker='<', linestyle='-', linewidth=2, markersize=6, color='C5', label=r"Naive")#($f_{child} = 2, n_{p} = 64$)")
plt.legend(loc='upper left')
plt.xscale('log')
plt.yscale('log')
plt.xlim(n[0], n[-1])
plt.xticks(n, n)
plt.yticks()
plt.draw()
plt.show()
\ No newline at end of file
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
const int n2 = n * n;
#pragma omp target map(to: m1[0:n2], m2[0:n2]), map(tofrom: m3[0:n2])
#pragma omp teams distribute parallel for collapse(2)
for(int i=0; i<n; ++i)
{
for(int j=0; j<n; ++j)
{
for (int k = 0; k < n; k++)
m3[i * n + j] += m1[i * n + k] * m2[k * n + j];
}
}
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
#pragma omp parallel for
for(int i=0; i<n; ++i)
for(int k=0; k<n; ++k)
for(int j=0; j<n; ++j)
m3[ i * n + j] += m1[i * n + k] * m2[k * n + j];
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
#include <ctime>
#include <cstdlib>
#include <chrono>
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::time_point<Clock> TimePoint;
typedef std::chrono::duration<double> Time;
void zeros(float *m, int n)
{
for(int i=0; i<n*n; ++i)
m[i] = 0;
}
void random(float *m, int n)
{
for(int i=0; i<n*n; ++i)
m[i] = (rand() % 1000 - 500) / 1000.0;
}
float checksum(const float *m3, int n)
{
float sum = 0.0f;
for(int i=0; i<n*n; i++)
sum += m3[i];
return sum;
}
double elapsedTime(TimePoint start, TimePoint stop)
{
return std::chrono::duration_cast<Time>(stop-start).count();
}
program matmul_prog
implicit none
integer icrow,iccol
integer ic,i,j
integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col
integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col
integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col
integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/)
double precision, dimension(a_row,a_col) :: a
double precision, dimension(b_row,b_col) :: b
double precision, dimension(c_row,c_col) :: c,c_ref
double precision, dimension(a_elem) :: a_temp
double precision, dimension(b_elem) :: b_temp
double precision num,sum,t1,t2
logical, parameter :: validation=.false.
logical, parameter :: print_result=.false.
if(a_col.ne.b_row) stop 'Wrong matrices dimensions'
!generate initial arrays
call random_seed(put=seed)
do i=1,a_elem
call random_number(num)
a_temp(i)=10.d0*num+1.d0
enddo
do i=1,b_elem
call random_number(num)
b_temp(i)=10.d0*num+1.d0
enddo
a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /)))
b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /)))
call cpu_time(t1)
!matrix multiplication
c=0.d0
do ic=1,c_elem
icrow=mod(ic-1,c_row)+1
iccol=(ic-1)/c_row+1
do i=1,a_col
c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol)
enddo
enddo
call cpu_time(t2)
print *,'Elapsed time:',t2-t1
!Print result
if(print_result) then
do icrow=1,c_row
print '(20(1x,f12.5))', (c(icrow,iccol),iccol=1,c_col)
enddo
endif
!Validation
if(validation) then
c_ref=matmul(a,b)
do i=1,c_row
do j=1,c_col
sum=sum+c(i,j)-c_ref(i,j)
enddo
enddo
print *,sum,sum/dble(c_elem)
endif
end program matmul_prog
program matmul_prog
use omp_lib
implicit none
integer icrow,iccol
integer ic,i,j,num_devices
integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col
integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col
integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col
integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/)
double precision, dimension(a_row,a_col) :: a
double precision, dimension(b_row,b_col) :: b
double precision, dimension(c_row,c_col) :: c,c_ref
double precision, dimension(a_elem) :: a_temp
double precision, dimension(b_elem) :: b_temp
double precision num,sum,t1,t2
logical, parameter :: validation=.true.
logical, parameter :: print_result=.false.
if(a_col.ne.b_row) stop 'Wrong matrices dimensions'
num_devices = omp_get_num_devices()
!generate initial arrays
call random_seed(put=seed)
do i=1,a_elem
call random_number(num)
a_temp(i)=10.d0*num+1.d0
enddo
do i=1,b_elem
call random_number(num)
b_temp(i)=10.d0*num+1.d0
enddo
a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /)))
b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /)))
t1=OMP_GET_WTIME()
!matrix multiplication
c=0.d0
!$omp target map(tofrom:c) map(to:a,b)
!$omp teams distribute parallel do private(ic,icrow,iccol,i)
do ic=1,c_elem
icrow=mod(ic-1,c_row)+1
iccol=(ic-1)/c_row+1
do i=1,a_col
c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol)
enddo
enddo
!$omp end target
t2=OMP_GET_WTIME()
print *,'Elapsed time:',t2-t1
!Print result
if(print_result) then
do icrow=1,c_row
print '(20(1x,f12.5))', (c(icrow,iccol),iccol=1,c_col)
enddo
endif
!Validation
if(validation) then
c_ref=matmul(a,b)
do i=1,c_row
do j=1,c_col
sum=sum+c(i,j)-c_ref(i,j)
enddo
enddo
print *,sum,sum/dble(c_elem)
endif
end program matmul_prog
program matmul_prog
implicit none
integer icrow,iccol
integer ic,i,j
integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col
integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col
integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col
integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/)
double precision, dimension(a_row,a_col) :: a
double precision, dimension(b_row,b_col) :: b
double precision, dimension(c_row,c_col) :: c,c_ref
double precision, dimension(a_elem) :: a_temp
double precision, dimension(b_elem) :: b_temp
double precision num,sum,t1,t2,OMP_GET_WTIME
logical, parameter :: validation=.true.
logical, parameter :: print_result=.false.
if(a_col.ne.b_row) stop 'Wrong matrices dimensions'
!generate initial arrays
call random_seed(put=seed)
do i=1,a_elem
call random_number(num)
a_temp(i)=10.d0*num+1.d0
enddo
do i=1,b_elem
call random_number(num)
b_temp(i)=10.d0*num+1.d0
enddo
a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /)))
b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /)))
t1=OMP_GET_WTIME()
!matrix multiplication
c=0.d0
!$omp parallel private(ic,icrow,iccol,i)
!$omp do schedule(static)
do ic=1,c_elem
icrow=mod(ic-1,c_row)+1
iccol=(ic-1)/c_row+1
do i=1,a_col
c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol)
enddo
enddo
!$omp end do
!$omp end parallel
t2=OMP_GET_WTIME()
print *,'Elapsed time:',t2-t1
!Print result
if(print_result) then
do icrow=1,c_row
print '(20(1x,f12.5))', (c(icrow,iccol),iccol=1,c_col)
enddo
endif
!Validation
if(validation) then
c_ref=matmul(a,b)
do i=1,c_row
do j=1,c_col
sum=sum+c(i,j)-c_ref(i,j)
enddo
enddo
print *,sum,sum/dble(c_elem)
endif
end program matmul_prog
import numpy as np
import time
from numpy.random import seed
from numpy.random import rand
from numba import jit,cuda
size_array=4096
size_vec=size_array*size_array
validate=True
def generate_array(num,size):
seed(num)
lst=10.*rand(size)+1.
return lst
a=np.array(generate_array(1,size_vec))
b=np.array(generate_array(2,size_vec))
c=np.zeros(size_vec)
@cuda.jit
def mat_mul(a,b,c,size):
row=cuda.blockIdx.y*cuda.blockDim.y+cuda.threadIdx.y
col=cuda.blockIdx.x*cuda.blockDim.x+cuda.threadIdx.x
for i in range(size):
c[row*size+col]+=a[row*size+i]*b[i*size+col]
threadsperblock = (16,16)
blockspergrid_x = int(np.ceil(size_array / threadsperblock[0]))
blockspergrid_y = int(np.ceil(size_array / threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)
start=time.time()
mat_mul[blockspergrid, threadsperblock](a,b,c,size_array)
end=time.time()
print('Elapsed time: ',end-start)
c=np.zeros(size_vec)
start=time.time()
mat_mul[blockspergrid, threadsperblock](a,b,c,size_array)
end=time.time()
print('Elapsed time: ',end-start)
if(validate):
a2d=np.reshape(a,(size_array,-1))
b2d=np.reshape(b,(size_array,-1))
c2d_numba=np.reshape(c,(size_array,-1))
c2d=np.matmul(a2d,b2d)
dif=np.abs(c2d_numba-c2d)
summation=np.sum(dif)
print(summation,summation/float(size_vec))
import numpy as np
import time
from numpy.random import seed
from numpy.random import rand
from numba import jit,prange
size_array=4096
size_vec=size_array*size_array
def generate_array(num,size):
seed(num)
lst=10.*rand(size)+1.
return lst
a=np.array(generate_array(1,size_vec))
b=np.array(generate_array(2,size_vec))
c=np.zeros(size_vec)
@jit
def mat_mul(a,b,c,size):
for i in range(size):
for k in range(size):
for j in range(size):
c[i*size+j]+=a[i*size+k]+b[k*size+j]
start=time.time()
c=mat_mul(a,b,c,size_array)
end=time.time()
print('Elapsed time: ',end-start)
c=np.zeros(size_vec)
start=time.time()
c=mat_mul(a,b,c,size_array)
end=time.time()
print('Elapsed time: ',end-start)
import numpy as np
import time
from numpy.random import seed
from numpy.random import rand
from numba import jit,prange
size_array=4096
size_vec=size_array*size_array
def generate_array(num,size):
seed(num)
lst=10.*rand(size)+1.
return lst
a=np.array(generate_array(1,size_vec))
b=np.array(generate_array(2,size_vec))
c=np.zeros(size_vec)
@jit(parallel=True)
def mat_mul(a,b,c,size):
for i in prange(size):
for k in range(size):
for j in range(size):
c[i*size+j]+=a[i*size+k]+b[k*size+j]
start=time.time()
c=mat_mul(a,b,c,size_array)
end=time.time()
print('Elapsed time: ',end-start)
c=np.zeros(size_vec)
start=time.time()
c=mat_mul(a,b,c,size_array)
end=time.time()
print('Elapsed time: ',end-start)
program montecarlo_pi
implicit none
integer i,countin
integer,parameter::npoints=100000000
double precision num,pi,pi_true,x,y,random_last
pi_true=acos(-1.d0)
countin=0
random_last=0
do i=1,npoints
call random_number(x)
call random_number(y)
if(x*x+y*y.le.1.d0)countin=countin+1
enddo
pi=dble(countin)/dble(npoints)*4.d0
print *,'Calculated: ',pi,'Real: ',pi_true
end program montecarlo_pi
program montecarlo_pi_v2
implicit none
integer i,countin,id
integer,parameter::npoints=100000000
double precision num,pi,pi_true,x,y
integer random_last
pi_true=acos(-1.d0)
countin=0
random_last=0
do i=1,npoints
call LCG_random(x,random_last)
call LCG_random(y,random_last)
if(x*x+y*y.le.1.d0)countin=countin+1
enddo
pi=dble(countin)/dble(npoints)*4.d0
print *,pi,pi_true
end program montecarlo_pi_v2
!***************************************************
subroutine LCG_random(num,random_last)
implicit none
integer,parameter::multiplier=1366,addend=150889,pmod=714025
integer,intent(inout)::random_last
double precision num
num=dble(mod(multiplier*random_last+addend,pmod))
random_last=num
num=num/dble(pmod)
return
end subroutine LCG_random
File added
pi.f90 0 → 100644
program pi_serial
implicit none
integer, parameter::steps=1000000000
integer i
double precision dx,x,sum,pi
pi=acos(-1.d0)
dx=1.d0/dble(steps)
sum=0.d0
do i=1,steps
x=(dble(i)-.5d0)*dx
sum=sum+4.d0/(1.d0+x*x)
enddo
sum=sum*dx
print *,sum,abs(pi-sum)/pi
end program pi_serial
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment