Commit a707aab2 authored by Ruben

course 2020

parent 2b021eb1
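! Estimate the area of the Mandelbrot set: test an npoints x npoints grid of
! values of c over the upper half of the rectangle [-2,0.5] x [0,1.125] and
! count the points that escape within mxitr iterations; the factor 2 in the
! area formula accounts for the symmetry about the real axis.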
program mandelbrot_area
  implicit none
  integer,parameter :: npoints=2000, mxitr=1000
  double precision,parameter :: eps=1.d-5
  integer i,j,numoutside
  double precision area,error,rp,ri
  numoutside=0
  print *,eps,numoutside
  do i=1,npoints
    do j=1,npoints
      rp=-2.d0+2.5d0*dble(i)/dble(npoints)+eps
      ri=1.125d0*dble(j)/dble(npoints)+eps
      call testpoint(rp,ri,numoutside,mxitr)
    enddo
  enddo
  area=2.d0*2.5d0*1.125d0*dble(npoints*npoints-numoutside)/dble(npoints*npoints)
  error=area/dble(npoints)
  print *,'area: ',area,numoutside
  print *,'error: ',error
end program mandelbrot_area
!***************************************************
subroutine testpoint(rp,ri,numoutside,mxitr)
  implicit none
  integer i
  integer,intent(in) :: mxitr
  integer,intent(inout) :: numoutside
  double precision,intent(in) :: rp,ri
  double precision temp,crp,cri
  crp=rp
  cri=ri
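  ! iterate z -> z**2 + c with c=(rp,ri); (crp,cri) holds the current z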
  do i=1,mxitr
    temp=crp*crp-cri*cri+rp
    cri=crp*cri*2.d0+ri
    crp=temp
    if(crp*crp+cri*cri.gt.4.d0) then
      numoutside=numoutside+1
      exit
    endif
  enddo
  return
end subroutine testpoint
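# Makefile: build and benchmark the matrix-multiplication variants
# (naive, order, threads, target, cuda) and collect the timings read by plot.py.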
CXX=g++
CXXFLAGS=-O2 -Wall -fopenmp
NVCC=nvcc
NVCCFLAGS=-O2
RUN=
# the brace expansion used in the test recipes ({$(RANGE)}) requires bash
SHELL=/bin/bash
RANGE=64,128,256,512,1024#,2048,4096,8192
NTHREADS=4
N=64
all: naive order threads target cuda

naive: naive.cpp utils.hpp
	$(CXX) $(CXXFLAGS) -o naive naive.cpp
order: order.cpp utils.hpp
	$(CXX) $(CXXFLAGS) -o order order.cpp
threads: threads.cpp utils.hpp
	$(CXX) $(CXXFLAGS) -o threads threads.cpp
target: target.cpp utils.hpp
	$(CXX) $(CXXFLAGS) -o target target.cpp
cuda: cuda.cu utils.hpp
	$(NVCC) $(NVCCFLAGS) -o cuda cuda.cu

.PHONY: all clean run test plot
clean:
	rm -f naive order threads target cuda
	rm -f time.*.txt
run: clean all
	@echo -n "naive: " && $(RUN) ./naive $N
	@echo -n "order: " && $(RUN) ./order $N
	@echo -n "threads: " && $(RUN) ./threads $N
	@echo -n "target: " && $(RUN) ./target $N
	@echo -n "cuda: " && $(RUN) ./cuda $N

test: clean all
	rm -f time.*.txt
	export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./naive $$i >> time.naive.txt; done
	export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./order $$i >> time.order.txt; done
	export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./threads $$i >> time.threads.txt; done
	export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./target $$i >> time.target.txt; done
	export OMP_NUM_THREADS=$(NTHREADS); for i in {$(RANGE)}; do echo $$i && ./cuda $$i >> time.cuda.txt; done

plot:
	python plot.py
#include <cstdio>
#include "utils.hpp"
__global__ void cuda_mul(float* m1, float* m2, float* m3, int n) {
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    if (row >= n || col >= n) return; // guard threads that fall outside the matrix
    for (int i = 0; i < n; i++)
        m3[row * n + col] += m1[row * n + i] * m2[i * n + col];
}
void mul(const float *m1, const float *m2, float *m3, int n)
{
    float *dm1 = nullptr, *dm2 = nullptr, *dm3 = nullptr;
    // allocate device buffers and copy the inputs (and the zeroed output) over
    cudaMalloc((void**)&dm1, sizeof(float) * n * n);
    cudaMalloc((void**)&dm2, sizeof(float) * n * n);
    cudaMalloc((void**)&dm3, sizeof(float) * n * n);
    cudaMemcpy(dm1, m1, sizeof(float) * n * n, cudaMemcpyHostToDevice);
    cudaMemcpy(dm2, m2, sizeof(float) * n * n, cudaMemcpyHostToDevice);
    cudaMemcpy(dm3, m3, sizeof(float) * n * n, cudaMemcpyHostToDevice);
    // 16x16 thread blocks; round the grid up so n need not be a multiple of 16
    dim3 blockSize = dim3(16, 16);
    dim3 gridSize = dim3((n + blockSize.x - 1) / blockSize.x,
                         (n + blockSize.y - 1) / blockSize.y);
    cuda_mul<<<gridSize, blockSize>>>(dm1, dm2, dm3, n);
    // the device-to-host copy also synchronizes with the kernel
    cudaMemcpy(m3, dm3, sizeof(float) * n * n, cudaMemcpyDeviceToHost);
    cudaFree(dm1);
    cudaFree(dm2);
    cudaFree(dm3);
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = nullptr, *m2 = nullptr, *m3 = nullptr;
cudaError_t err = cudaMallocHost((void**)&m1, sizeof(float) * n * n);
if(err) printf("Error status is %s\n", cudaGetErrorString(err));
err = cudaMallocHost((void**)&m2, sizeof(float) * n * n);
if(err) printf("Error status is %s\n", cudaGetErrorString(err));
err = cudaMallocHost((void**)&m3, sizeof(float) * n * n);
if(err) printf("Error status is %s\n", cudaGetErrorString(err));
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
cudaFreeHost(m1);
cudaFreeHost(m2);
cudaFreeHost(m3);
}
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
    for(int i=0; i<n; ++i)
        for(int j=0; j<n; ++j)
            for(int k=0; k<n; ++k)
                m3[i*n + j] += m1[i*n + k] * m2[k*n + j];
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
    for(int i=0; i<n; ++i)
        for(int k=0; k<n; ++k)
            for(int j=0; j<n; ++j)
                m3[i*n + j] += m1[i*n + k] * m2[k*n + j];
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
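# plot.py: plot the timings collected by `make test` (time.*.txt, one
# "checksum time" line per matrix size) on a log-log scale.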
import os
import math
import numpy as np
from matplotlib import pyplot as plt
# atleast_2d keeps the [:,1] indexing below valid when a file holds a single row
cuda = np.atleast_2d(np.loadtxt('time.cuda.txt'))
target = np.atleast_2d(np.loadtxt('time.target.txt'))
threads = np.atleast_2d(np.loadtxt('time.threads.txt'))
order = np.atleast_2d(np.loadtxt('time.order.txt'))
naive = np.atleast_2d(np.loadtxt('time.naive.txt'))
l = len(naive)
n = [64 * 2**i for i in range(0, l)]
print(l, n)
plt.figure(figsize=(5,4))
if cuda.any(): plt.plot(n, cuda[:,1], marker='x', linestyle='-', linewidth=2, markersize=6, color='C0', label="CUDA")
if target.any(): plt.plot(n, target[:,1], marker='v', linestyle='-', linewidth=2, markersize=6, color='C1', label="OpenMP Offloading")
if threads.any(): plt.plot(n, threads[:,1], marker='o', linestyle='-', linewidth=2, markersize=6, color='C2', label="OpenMP Threading")
if order.any(): plt.plot(n, order[:,1], marker='>', linestyle='-', linewidth=2, markersize=6, color='C4', label="Loop Order")
if naive.any(): plt.plot(n, naive[:,1], marker='<', linestyle='-', linewidth=2, markersize=6, color='C5', label="Naive")
plt.legend(loc='upper left')
plt.xscale('log')
plt.yscale('log')
plt.xlim(n[0], n[-1])
plt.xticks(n, n)
plt.yticks()
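# axis labels; the time column comes from elapsedTime() in utils.hpp (milliseconds)
plt.xlabel('matrix size n')
plt.ylabel('time [ms]')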
plt.draw()
plt.show()
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
    const int n2 = n * n;
    #pragma omp target map(to: m1[0:n2], m2[0:n2]) map(tofrom: m3[0:n2])
    #pragma omp teams distribute parallel for collapse(2)
    for(int i=0; i<n; ++i)
    {
        for(int j=0; j<n; ++j)
        {
            for (int k = 0; k < n; k++)
                m3[i * n + j] += m1[i * n + k] * m2[k * n + j];
        }
    }
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
#include <cstdio>
#include "utils.hpp"
void mul(const float *m1, const float *m2, float *m3, int n)
{
    #pragma omp parallel for
    for(int i=0; i<n; ++i)
        for(int k=0; k<n; ++k)
            for(int j=0; j<n; ++j)
                m3[i*n + j] += m1[i*n + k] * m2[k*n + j];
}
int main(int argc, char **argv)
{
int n = 64;
if(argc > 1)
n = atoi(argv[1]);
srand(12);
float *m1 = new float[n*n];
float *m2 = new float[n*n];
float *m3 = new float[n*n];
random(m1, n);
random(m2, n);
random(m3, n);
zeros(m3, n);
TimePoint tstart, tstop;
tstart = Clock::now();
mul(m1, m2, m3, n);
tstop = Clock::now();
//printf("Checksum: %f -- Time: %fms\n", checksum(m3, n), elapsedTime(tstart, tstop));
printf("%f %f\n", checksum(m3, n), elapsedTime(tstart, tstop));
delete[] m1;
delete[] m2;
delete[] m3;
}
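// utils.hpp: shared helpers for the benchmarks (timing with std::chrono,
// random initialization, zeroing and a checksum for validation).
#pragma once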
#include <ctime>
#include <cstdlib>
#include <chrono>
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::time_point<Clock> TimePoint;
typedef std::chrono::duration<double, std::milli> Time; // elapsed time in milliseconds
void zeros(float *m, int n)
{
for(int i=0; i<n*n; ++i)
m[i] = 0;
}
void random(float *m, int n)
{
for(int i=0; i<n*n; ++i)
m[i] = (rand() % 1000 - 500) / 1000.0;
}
float checksum(const float *m3, int n)
{
float sum = 0.0f;
for(int i=0; i<n*n; i++)
sum += m3[i];
return sum;
}
double elapsedTime(TimePoint start, TimePoint stop)
{
return std::chrono::duration_cast<Time>(stop-start).count();
}
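! Serial reference version of the dense product C = A*B with
! A(840x3230) and B(3230x802); timed with cpu_time and optionally
! validated against the matmul intrinsic.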
program matmul_prog
implicit none
integer icrow,iccol
integer ic,i,j
integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col
integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col
integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col
integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/)
double precision, dimension(a_row,a_col) :: a
double precision, dimension(b_row,b_col) :: b
double precision, dimension(c_row,c_col) :: c,c_ref
double precision, dimension(a_elem) :: a_temp
double precision, dimension(b_elem) :: b_temp
double precision num,sum,t1,t2
logical, parameter :: validation=.false.
logical, parameter :: print_result=.false.
if(a_col.ne.b_row) stop 'Wrong matrices dimensions'
!generate initial arrays
call random_seed(put=seed)
do i=1,a_elem
call random_number(num)
a_temp(i)=10.d0*num+1.d0
enddo
do i=1,b_elem
call random_number(num)
b_temp(i)=10.d0*num+1.d0
enddo
a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /)))
b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /)))
call cpu_time(t1)
!matrix multiplication
c=0.d0
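! the two loops over the elements of c are fused into a single loop over
! ic=1..c_elem; icrow/iccol recover the row/column indices (column-major)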
do ic=1,c_elem
  icrow=mod(ic-1,c_row)+1
  iccol=(ic-1)/c_row+1
  do i=1,a_col
    c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol)
  enddo
enddo
call cpu_time(t2)
print *,'Elapsed time:',t2-t1
!Print result
if(print_result) then
  do icrow=1,c_row
    print '(20(1x,f12.5))', (c(icrow,iccol),iccol=1,c_col)
  enddo
endif
!Validation
if(validation) then
  c_ref=matmul(a,b)
  sum=0.d0
  do i=1,c_row
    do j=1,c_col
      sum=sum+c(i,j)-c_ref(i,j)
    enddo
  enddo
  print *,sum,sum/dble(c_elem)
endif
end program matmul_prog
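! OpenMP offloading version: the fused loop over the elements of c is
! executed on the device; timing uses omp_get_wtime and the result is
! validated against the matmul intrinsic.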
program matmul_prog
use omp_lib
implicit none
integer icrow,iccol
integer ic,i,j,num_devices
integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col
integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col
integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col
integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/)
double precision, dimension(a_row,a_col) :: a
double precision, dimension(b_row,b_col) :: b
double precision, dimension(c_row,c_col) :: c,c_ref
double precision, dimension(a_elem) :: a_temp
double precision, dimension(b_elem) :: b_temp
double precision num,sum,t1,t2
logical, parameter :: validation=.true.
logical, parameter :: print_result=.false.
if(a_col.ne.b_row) stop 'Wrong matrices dimensions'
num_devices = omp_get_num_devices()
!generate initial arrays
call random_seed(put=seed)
do i=1,a_elem
call random_number(num)
a_temp(i)=10.d0*num+1.d0
enddo
do i=1,b_elem
call random_number(num)
b_temp(i)=10.d0*num+1.d0
enddo
a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /)))
b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /)))
t1=OMP_GET_WTIME()
!matrix multiplication
c=0.d0
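! a and b are mapped to the device read-only, c is mapped to and from it;
! teams distribute parallel do spreads the fused iteration space over the
! device teams and threads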
!$omp target map(tofrom:c) map(to:a,b)
!$omp teams distribute parallel do private(ic,icrow,iccol,i)
do ic=1,c_elem
  icrow=mod(ic-1,c_row)+1
  iccol=(ic-1)/c_row+1
  do i=1,a_col
    c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol)
  enddo
enddo
!$omp end target
t2=OMP_GET_WTIME()
print *,'Elapsed time:',t2-t1
!Print result
if(print_result) then
  do icrow=1,c_row
    print '(20(1x,f12.5))', (c(icrow,iccol),iccol=1,c_col)
  enddo
endif
!Validation
if(validation) then
  c_ref=matmul(a,b)
  sum=0.d0
  do i=1,c_row
    do j=1,c_col
      sum=sum+c(i,j)-c_ref(i,j)
    enddo
  enddo
  print *,sum,sum/dble(c_elem)
endif
end program matmul_prog
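! OpenMP CPU-threaded version: the fused loop over the elements of c is
! shared among the host threads with a static schedule; timing uses
! omp_get_wtime.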
program matmul_prog
implicit none
integer icrow,iccol
integer ic,i,j
integer,parameter :: a_row=840, a_col=3230, a_elem=a_row*a_col
integer,parameter :: b_row=3230, b_col=802, b_elem=b_row*b_col
integer,parameter :: c_row=a_row, c_col=b_col, c_elem=c_row*c_col
integer, dimension(12) :: seed=(/3,6,1,2,8,4,2,9,0,4,5,6/)
double precision, dimension(a_row,a_col) :: a
double precision, dimension(b_row,b_col) :: b
double precision, dimension(c_row,c_col) :: c,c_ref
double precision, dimension(a_elem) :: a_temp
double precision, dimension(b_elem) :: b_temp
double precision num,sum,t1,t2,OMP_GET_WTIME
logical, parameter :: validation=.true.
logical, parameter :: print_result=.false.
if(a_col.ne.b_row) stop 'Wrong matrices dimensions'
!generate initial arrays
call random_seed(put=seed)
do i=1,a_elem
call random_number(num)
a_temp(i)=10.d0*num+1.d0
enddo
do i=1,b_elem
call random_number(num)
b_temp(i)=10.d0*num+1.d0
enddo
a = transpose(reshape(a_temp,(/ size(a, 2), size(a, 1) /)))
b = transpose(reshape(b_temp,(/ size(b, 2), size(b, 1) /)))
t1=OMP_GET_WTIME()
!matrix multiplication
c=0.d0
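! each thread gets a static chunk of the fused index range; every ic maps to
! a distinct c(icrow,iccol), so no two threads write the same element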
!$omp parallel private(ic,icrow,iccol,i)
!$omp do schedule(static)
do ic=1,c_elem
  icrow=mod(ic-1,c_row)+1
  iccol=(ic-1)/c_row+1
  do i=1,a_col
    c(icrow,iccol)=c(icrow,iccol)+a(icrow,i)*b(i,iccol)
  enddo
enddo
!$omp end do
!$omp end parallel
t2=OMP_GET_WTIME()