
Homework #4 EL6201 - Parallel System

Ardianto Satriawan
23213079

Semester II 2013/2014

1 OpenMP Matrix Addition


For a baseline comparison, a 500 × 500 matrix addition program is created as follows. Matrices A and
B are initialized in matrices500.h:

/*
 * Ardianto Satriawan
 * Semester II 2013/14
 * Serial Matrix Addition
 * EL6201 Parallel System
 */

#define LENGTH 500

#include <stdio.h>
#include <math.h>
#include <time.h>

/* the matrices to add are in this header file, declared by extern */
#include "matrices500.h"

int main(int argc, char *argv[]) {
    clock_t tic = clock();

    /* result of matrix addition */
    int C[LENGTH][LENGTH];

    /* matrix addition calculation */
    int i, j;
    for (i = 0; i < LENGTH; i++) {
        for (j = 0; j < LENGTH; j++) {
            C[i][j] = A[i][j] + B[i][j];
        }
    }

    clock_t toc = clock();
    printf("Elapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
    return 0;
}

matrixadd.c
I created four variants of the OpenMP matrix addition by adding one of the following directives
before the nested for loop:

#pragma omp parallel for

#pragma omp parallel for private(j)

#pragma omp parallel for schedule(dynamic)

#pragma omp parallel for schedule(dynamic,2)

Here are their code listings; the four versions differ only in the OpenMP directive placed before the outer for loop.
/*
 * Ardianto Satriawan
 * Semester II 2013/14
 * Matrix Addition
 * using #pragma omp parallel for directive
 * EL6201 Parallel System
 */

#define LENGTH 500

#include <stdio.h>
#include <math.h>
#include <time.h>

/* the matrices to add are in this header file, declared by extern */
#include "matrices500.h"

int main(int argc, char *argv[]) {
    clock_t tic = clock();

    /* result of matrix addition */
    int C[LENGTH][LENGTH];

    /* matrix addition calculation */
    int i, j;
    #pragma omp parallel for
    for (i = 0; i < LENGTH; i++) {
        for (j = 0; j < LENGTH; j++) {
            C[i][j] = A[i][j] + B[i][j];
        }
    }

    clock_t toc = clock();
    printf("Elapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
    return 0;
}

matrixaddomp1.c

/*
 * Ardianto Satriawan
 * Semester II 2013/14
 * Matrix Addition
 * using #pragma omp parallel for private(j)
 * EL6201 Parallel System
 */

#define LENGTH 500

#include <stdio.h>
#include <math.h>
#include <time.h>

/* the matrices to add are in this header file, declared by extern */
#include "matrices500.h"

int main(int argc, char *argv[]) {
    clock_t tic = clock();

    /* result of matrix addition */
    int C[LENGTH][LENGTH];

    /* matrix addition calculation */
    int i, j;
    #pragma omp parallel for private(j)
    for (i = 0; i < LENGTH; i++) {
        for (j = 0; j < LENGTH; j++) {
            C[i][j] = A[i][j] + B[i][j];
        }
    }

    clock_t toc = clock();
    printf("Elapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
    return 0;
}

matrixaddomp2.c

/*
 * Ardianto Satriawan
 * Semester II 2013/14
 * Matrix Addition
 * using #pragma omp parallel for schedule(dynamic)
 * EL6201 Parallel System
 */

#define LENGTH 500

#include <stdio.h>
#include <math.h>
#include <time.h>

/* the matrices to add are in this header file, declared by extern */
#include "matrices500.h"

int main(int argc, char *argv[]) {
    clock_t tic = clock();

    /* result of matrix addition */
    int C[LENGTH][LENGTH];

    /* matrix addition calculation */
    int i, j;
    #pragma omp parallel for schedule(dynamic)
    for (i = 0; i < LENGTH; i++) {
        for (j = 0; j < LENGTH; j++) {
            C[i][j] = A[i][j] + B[i][j];
        }
    }

    clock_t toc = clock();
    printf("Elapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
    return 0;
}

matrixaddomp3.c
/*
 * Ardianto Satriawan
 * Semester II 2013/14
 * Matrix Addition
 * using #pragma omp parallel for schedule(dynamic,2)
 * EL6201 Parallel System
 */

#define LENGTH 500

#include <stdio.h>
#include <math.h>
#include <time.h>

/* the matrices to add are in this header file, declared by extern */
#include "matrices500.h"

int main(int argc, char *argv[]) {
    clock_t tic = clock();

    /* result of matrix addition */
    int C[LENGTH][LENGTH];

    /* matrix addition calculation */
    int i, j;
    #pragma omp parallel for schedule(dynamic,2)
    for (i = 0; i < LENGTH; i++) {
        for (j = 0; j < LENGTH; j++) {
            C[i][j] = A[i][j] + B[i][j];
        }
    }

    clock_t toc = clock();
    printf("Elapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
    return 0;
}

matrixaddomp4.c
The comparison between the serial version and the four OpenMP variants can be viewed in the following table:

[Table: elapsed times of the serial program and the four OpenMP variants]

We can see that #pragma omp parallel for private(j) makes the program run 1.21 times
faster than the serial version, while the other directives make the program run slower.
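A likely explanation: in matrixaddomp1.c the inner counter j is declared outside the parallel region, so OpenMP treats it as shared and the threads race on it, while private(j) gives every thread its own copy. A minimal sketch of an equivalent fix, assuming a C99 compiler, is to declare both counters in the loop headers so that each is automatically private:

#pragma omp parallel for
for (int i = 0; i < LENGTH; i++) {      /* i: parallelized loop variable, implicitly private */
    for (int j = 0; j < LENGTH; j++) {  /* j: declared inside the region, private per thread */
        C[i][j] = A[i][j] + B[i][j];
    }
}

The dynamic schedules, in turn, add per-chunk bookkeeping overhead, which can outweigh any benefit for a loop whose iterations all cost the same.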

2 OpenMP Matrix Multiplication


For a baseline comparison, a 500 × 500 matrix multiplication program is created as follows. Matrices
A and B are initialized in matrices500.h:

/*
 * Ardianto Satriawan
 * Semester II 2013/14
 * Serial Matrix Multiplication
 * EL6201 Parallel System
 */

#define LENGTH 500

#include <stdio.h>
#include <math.h>
#include <time.h>

/* the matrices to multiply are in this header file, declared by extern */
#include "matrices500.h"

int main(int argc, char *argv[]) {
    clock_t tic = clock();

    /* result of matrix multiplication */
    int C[LENGTH][LENGTH];

    /* matrix multiplication calculation */
    int i, j, k;
    for (i = 0; i < LENGTH; i++) {
        for (j = 0; j < LENGTH; j++) {
            C[i][j] = 0;
            for (k = 0; k < LENGTH; k++) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }

    clock_t toc = clock();
    printf("Elapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
    return 0;
}

matrixmul.c
Analyzing the data dependencies of the operation C[i][j] += A[i][k] * B[k][j], we can see that
the iterations over i are independent of one another, and the same holds for the iterations over j.
For simplicity, we parallelize only a single for loop (an alternative covering both loops is sketched
after the listing) by adding

#pragma omp parallel for default(none) shared(A,B,C) private(j,k)

before the nested for loop.
/*
 * Ardianto Satriawan
 * Semester II 2013/14
 * Matrix Multiplication OpenMP variant
 * EL6201 Parallel System
 */

#define LENGTH 500

#include <stdio.h>
#include <math.h>
#include <time.h>

/* the matrices to multiply are in this header file, declared by extern */
#include "matrices500.h"

int main(int argc, char *argv[]) {
    clock_t tic = clock();

    /* result of matrix multiplication */
    int C[LENGTH][LENGTH];

    /* matrix multiplication calculation */
    int i, j, k;

    #pragma omp parallel for default(none) shared(A,B,C) private(j,k)
    for (i = 0; i < LENGTH; i++) {
        for (j = 0; j < LENGTH; j++) {
            C[i][j] = 0;
            for (k = 0; k < LENGTH; k++) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }

    clock_t toc = clock();
    printf("Elapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
    return 0;
}

matrixmulomp.c
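Both outer loops could also have been parallelized together with the collapse clause, available since OpenMP 3.0; a minimal sketch of that alternative, which was not benchmarked here:

#pragma omp parallel for collapse(2) private(k)
for (i = 0; i < LENGTH; i++) {
    for (j = 0; j < LENGTH; j++) {  /* i and j are collapsed loop variables, implicitly private */
        C[i][j] = 0;
        for (k = 0; k < LENGTH; k++) {
            C[i][j] += A[i][k] * B[k][j];
        }
    }
}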
With the OpenMP directive, the iterations of the i loop are divided into chunks, and each chunk
is assigned to a thread, so multiple threads compute their chunks in parallel, as the sketch below illustrates.
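A small standalone sketch (with a hypothetical reduced size N so the output stays readable) makes the chunk assignment visible by printing which thread computes each row:

#include <omp.h>
#include <stdio.h>

#define N 8  /* reduced size, for readable output */

int main(void) {
    int i;
    #pragma omp parallel for
    for (i = 0; i < N; i++) {
        printf("row %d computed by thread %d\n", i, omp_get_thread_num());
    }
    return 0;
}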
Performance comparison between the serial and OpenMP versions:

matrixmul.c (serial): 2.64 seconds
matrixmulomp.c (OpenMP): 1.40 seconds

Speed-up = 2.64 s / 1.40 s ≈ 1.88
The parallel version is about 1.88 times faster than the serial version on a dual-core system,
which is reasonably close to the ideal two-fold speed-up (a parallel efficiency of about 94%).
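One caveat on the measurement method: clock() returns CPU time, which on many systems (Linux/glibc among them) is summed over all threads of the process, so it can understate the advantage of the parallel versions. A minimal sketch of wall-clock timing with omp_get_wtime() instead:

#include <omp.h>
#include <stdio.h>

int main(void) {
    double tic = omp_get_wtime();  /* wall-clock time in seconds */

    /* ... parallel region under test goes here ... */

    double toc = omp_get_wtime();
    printf("Elapsed: %f seconds\n", toc - tic);
    return 0;
}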
