Problème code multi-threadé avec fonction récursive

**fab13** · 27/05/2011, 10h38

Bonjour,

j'ai développé, en reprenant un programme fait pour tourner sur 2 coeurs, une petite apllication du calcul de pi en essayant de le généraliser à un nombre quelconque de threads (à condition d'avoir le rapport (nombre d'itérations/nombre de thread) entier ).

Le code original (pour 2 coeurs) est le suivant :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
 
/* parallel computation of pi with monte carlo method */ 
#include <stdio.h> 
#include <stdlib.h>
#include <unistd.h> 
#include <fcntl.h> 
#include <sys/types.h> 
#include <sys/stat.h> 
#include <sys/mman.h> 
#include <sys/wait.h> 
#include <sys/time.h> 
#define SEED 35791246 
 
struct sharedMem { double child_p_in; };
 
 
 int main (int argc, char** argv) 
 
 { double parent_p_in; // number of hits inside the circle 
   double niter; // total number of hits 
   int fd; int i; 
   double x,y,z,pi;
   struct sharedMem* smem; 
   struct timeval chrono1, chrono2; 
   int micro, second; pid_t pid; 
 
  // parse the second argument 
  if (argc != 2)
   { printf ("Please, specify interval number\n"); 
     exit (1); } 
     niter = (double) atoi(argv[1]); 
 
  // create the shared memory 
  fd = open ("/tmp/myregion", O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
   if (fd == -1) exit(2);
 
    if (ftruncate (fd, sizeof (struct sharedMem)) == -1) exit(3); 
 
    smem = mmap (NULL, sizeof (struct sharedMem),
              PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
     if (smem == MAP_FAILED) exit(1); 
 
 
     srand(SEED); 
 
     // initialize random numbers 
     gettimeofday(&chrono1, NULL);
 
     // monte carlo's method in parallel 
 
     pid = fork(); 
     if (pid < 0) exit(4); 
     else if (pid == 0) { 
      // child process
      // compute the first half of niter hits 
 
      smem->child_p_in = 0; 
 
      for (i=0; i<(niter/2); i++) 
      { x = (double) rand() / RAND_MAX; 
      y = (double) rand() / RAND_MAX; 
      z = x*x + y*y; 
      if (z <= 1.0) 
       smem->child_p_in++; } 
 
       exit(0); }
       else { 
 
       // parent process // compute the second half of niter hits 
       parent_p_in = 0; 
       for (i=(niter/2); i<niter; i++) 
       { x = (double) rand() / RAND_MAX;
        y = (double) rand() / RAND_MAX; 
	z = x*x + y*y; 
	if (z <= 1.0) parent_p_in++; } 
 
 
	wait(NULL); 
	// wait for child 
 
	}
 
	gettimeofday(&chrono2, NULL); 
//	munmap(smem, sizeof(struct sharedMem)); 
	close(fd); 
	// compute ellapsed time 
	micro = chrono2.tv_usec - chrono1.tv_usec; 
	if (micro < 0) 
	{ micro += 1000000; 
	second = chrono2.tv_sec - chrono1.tv_sec - 1; } 
	else 
	second = chrono2.tv_sec - chrono1.tv_sec;
 
	pi = (parent_p_in + smem->child_p_in)/niter; 
        pi *= 4; 
 
	// special format fot limted niter parameter < 10e9
 
	printf("# of trials= %d , estimate of pi is %f \n",(int) niter,pi);
	printf("%d second %d micro ellapsed\n", second, micro);
 
  return 0;
 
   }

On se sert du fork() pour que chaque thread génère (n_iterations/2) nombres aléatoires.
A l'exécution, ce programme tourne effectivement sur 2 coeurs.

Pour le généraliser, j'ai codé une fonction récursive qui par exemple, pour un nombre de thread "nb_thread" égale à 4, va utiliser 2 fork(). Je récupère le nombre de thread et le nombre d'itérations en paramètres de l'exécutable.

Voici la procédure récursive :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
 
/* parallel computation of pi with monte carlo method */ 
#include <stdio.h> 
#include <stdlib.h>
#include <unistd.h> 
#include <fcntl.h> 
#include <sys/types.h> 
#include <sys/stat.h> 
#include <sys/mman.h> 
#include <sys/wait.h> 
#include <sys/time.h> 
#define SEED 35791246 
 
struct sharedMem { double j[100]; };
 
 
struct sharedMem* Parallel( struct sharedMem* smem, pid_t* pid, int nbt, long int niter, int statut, int options, int nb_current )
{
  double x,y,z;
  long int i;
     if (nbt==1)  
               { smem->j[nb_current]=0;
                 for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX; 
                        y = (double) rand() / RAND_MAX; 
                        z = x*x + y*y;
                        if (z <= 1.0) 
                 {smem->j[nb_current]++;}}
		 return smem;
		 }
 else if (nb_current==(nbt-1)) return smem;
     else {  pid[nb_current]=fork();          
     	     if (pid[nb_current] < 0) exit(4); 
                else if (pid[nb_current]==0) 
		    {// child process
                      // compute the niter/nbt part of niter hits 
                      smem->j[nb_current]=0;
                    for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX; 
                        y = (double) rand() / RAND_MAX; 
                        z = x*x + y*y;
                        if (z <= 1.0) 
                         {smem->j[nb_current]++;}}
			 exit(0); 
		      }
		    else {
		      // parent process 
		      // compute the niter/nbt part of niter hits 
                      smem->j[nb_current+1] = 0;		       		      
                      for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX;
                        y = (double) rand() / RAND_MAX; 
	                z = x*x + y*y; 
	                if (z <= 1.0) smem->j[nb_current+1]++; }
                        waitpid(pid[nb_current],&statut,options);			
         	        return Parallel(smem,pid,nbt,niter,statut,options,nb_current+1);   	 
                      }
		   exit(0);   
 
	  }
 
 }  
 
.....

Pour créer la mémoire partagée, j'utilise comme pour l'algo original :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
 
 // create the shared memory 
  fd = open ("/tmp/myregion", O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
   if (fd == -1) exit(2);
 
    if (ftruncate (fd, sizeof (struct sharedMem)) == -1) exit(3); 
 
    smem = mmap (NULL, sizeof (struct sharedMem),
              PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
     if (smem == MAP_FAILED) exit(1);

et avec l'appel de la fonction récursive :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
 
 // recursive function 
 
     smem=Parallel(smem,pid,nb_thread,niter,statut,options,nb_current);

Mais voilà, le problème est qu'à l'exécution, pour un nombre de processus supérieur à 2 et inférieur ou égal à 8 (j'ai un processeur 8 cores), le programme ne tourne pas sur le nombre de cores correspondant ( vérifié grâce à la commande "htop" ou "ps aux | grep a.out" ) :

Pour nb_thread=4, j'aimerais qu'il tourne sur 4, pour 6, sur 6 ...

Il donne bien le résultat attendu, à savoir une estimation de pi, d'autant plus précise que le nombre d'itérations est grand,

Mais pourquoi ne tourne t-il pas sur le nombre de coeurs attendu comme j'aimerais (du moins pour nb_thread compris entre 4 et 8) ?

Est-ce que ceci est du au fait que je fait le fork() dans une fonction et non dans le "main" ?

Si quelqu'un pouvait m'aider à voir plus clair,

Merci d'avance.

**gangsoleil** · 27/05/2011, 14h15

Bonjour,

Envoyé par fab13

le problème est qu'à l'exécution, pour un nombre de processus supérieur à 2 et inférieur ou égal à 8 (j'ai un processeur 8 cores), le programme ne tourne pas sur le nombre de cores correspondant ( vérifié grâce à la commande "htop" ou "ps aux | grep a.out" ) :

Pour nb_thread=4, j'aimerais qu'il tourne sur 4, pour 6, sur 6 ...

Que vois-tu comme comportement, et a quoi t'attendrais-tu ?

**matafan** · 27/05/2011, 14h25

En passant, ton code n'utilise pas de threads, il utilise des process.

**fab13** · 27/05/2011, 16h33

Envoyé par gangsoleil

Bonjour,

Que vois-tu comme comportement, et a quoi t'attendrais-tu ?

Quand je lance par exemple mon application (avec nb_thread=4 et n_iterations=1000000000), je ne vois que 2 process qui tournent :

$ ps aux | grep a.out
fab 18215 20.6 0.0 3896 492 pts/1 R 15:32 0:08 ./a.out 4 1000000000
fab 18216 20.6 0.0 3896 220 pts/1 R 15:32 0:08 ./a.out 4 1000000000

avec le résultat de la commande htop :

[IMG]
https://picasaweb.google.com/1026685...91128855362226
[/IMG]

On ne voit donc que 2 coeurs sollicités au lieu de 4 attendus.

Voici une version du code qui tourne bien sur 4 coeurs :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
 
/* parallel computation of pi with monte carlo method */ 
#include <stdio.h> 
#include <stdlib.h>
#include <unistd.h> 
#include <fcntl.h> 
#include <sys/types.h> 
#include <sys/stat.h> 
#include <sys/mman.h> 
#include <sys/wait.h> 
#include <sys/time.h> 
#define SEED 35791246 
 
struct sharedMem { double j[4]; };
 
 
 int main (int argc, char** argv) 
 
 { long int niter; // total number of hits 
   long int fd;long int i;
 
   int nb_thread=4; // number of threads 
   double x,y,z;
   long double pi;
   struct sharedMem* smem; 
   struct timeval chrono1, chrono2; 
   int micro, second; pid_t pid1,pid2; 
   int pseudo_pid;
   int statut;
   int options = 0;   
 
 
  // parse the second argument 
  if (argc != 2)
   { printf ("Please, specify interval number and number of threads\n"); 
     exit (1); } 
 
     niter = atol(argv[1]);   
 
     printf("nb_thread=%d\n",nb_thread);
     printf("niter=%ld\n",niter);
 
 
  // create the shared memory 
  fd = open ("/tmp/myregion", O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
   if (fd == -1) exit(2);
 
    if (ftruncate (fd, sizeof (struct sharedMem)) == -1) exit(3); 
 
    smem = mmap (NULL, sizeof (struct sharedMem),
              PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
     if (smem == MAP_FAILED) exit(1); 
 
 
     srand(SEED); 
 
     // initialize random numbers 
     gettimeofday(&chrono1, NULL);
 
     // monte carlo's method in parallel 
 
     pid1=fork();
 
 
     if (pid1 < 0) exit(4); 
 
     else if (pid1 == 0) { 
 
          pid2=fork();	       
          if (pid2 < 0) exit(4); 
 
     else if (pid2 == 0) { 
 
      // child process
      // compute the first quarter of niter hits 
 
      smem->j[0]=0;
      for (i=0; i<(niter/nb_thread); i++) 
      { x = (double) rand() / RAND_MAX; 
      y = (double) rand() / RAND_MAX; 
      z = x*x + y*y;
      if (z <= 1.0) 
       smem->j[0]++;
 } 
 
       exit(0); 
}
       else { 
 
       // parent process // compute the second quart of niter hits 
       smem->j[1] = 0; 
       for (i=(niter/nb_thread); i<(2*niter/nb_thread); i++) 
       { x = (double) rand() / RAND_MAX;
        y = (double) rand() / RAND_MAX; 
	z = x*x + y*y; 
	if (z <= 1.0) smem->j[1]++; } 
 
        waitpid(pid2,&statut,options);
 
	}
	exit(0);
	}
 
      else  {
 
       pid2=fork();	  
       if (pid2 < 0) exit(4); 
 
     else if (pid2 == 0) {
 
     // child process
      // compute the third half of niter hits 
 
      smem->j[2]=0;
      for (i=2*niter/nb_thread; i<(3*niter/nb_thread); i++) 
      { x = (double) rand() / RAND_MAX; 
      y = (double) rand() / RAND_MAX; 
      z = x*x + y*y;
      if (z <= 1.0) 
       smem->j[2]++;
 } 
 
       exit(0); 
 
       }
       else { 
 
       // parent process // compute the forth half of niter hits 
       smem->j[3] = 0; 
       for (i=3*niter/nb_thread; i<niter; i++) 
       { x = (double) rand() / RAND_MAX;
        y = (double) rand() / RAND_MAX; 
	z = x*x + y*y; 
	if (z <= 1.0) smem->j[3]++; } 
 
        waitpid(pid2,&statut,options);
 
 
	}
        waitpid(pid1,&statut,options);	
	}
 
 
 
	gettimeofday(&chrono2, NULL); 
 
	close(fd); 
	// compute ellapsed time 
	micro = chrono2.tv_usec - chrono1.tv_usec; 
	if (micro < 0) 
	{ micro += 1000000; 
	second = chrono2.tv_sec - chrono1.tv_sec - 1; } 
	else 
	second = chrono2.tv_sec - chrono1.tv_sec;
 
 
	pi = (long double) ( smem->j[0] + smem->j[1] + smem->j[2] +smem->j[3])/niter; 
        pi *= 4; 
 
	// special format fot limted niter parameter < 10e9
 
	printf("# of trials= %ld , estimate of pi is %1.8Lf \n",niter,pi);
	printf("%d second %d micro ellapsed\n", second, micro);
 
  return 0;
 
   }

avec :

$ ps aux | grep a.out
fab 18449 86.1 0.0 3768 500 pts/1 R 15:48 0:05 ./a.out 10000000000000
fab 18450 86.3 0.0 3768 212 pts/1 R 15:48 0:05 ./a.out 10000000000000
fab 18451 86.3 0.0 3768 212 pts/1 R 15:48 0:05 ./a.out 10000000000000
fab 18452 86.1 0.0 3768 212 pts/1 R 15:48 0:05 ./a.out 10000000000000

et htop :

[IMG]
https://picasaweb.google.com/1026685...92349965329410
[/IMG]

On voit bien que les 4 cores sont sollicités avec 4 process qui tournent.

Vous pouvez voir, dans cette version "4 coeurs", que j'ai écrit "séquentiellement" les 2 fork() avec les conditions "if" pour savoir si je suis dans le processus père-père, père-fils, fils-père ou fils-fils.

C'est pour cela que j'ai voulu généraliser cet algo pour un nombre de process quelconque grâce à la procédure récusive de mon premier post. J'aimerais qu'au moins, jusqu'à 8 threads, je puisse avoir 8 process "a.out" qui s'affichent dans "ps aux | grep a.out" et voir les 8 coeurs sollicités lors de l'exécution avec htop.

Pour un nombre de thread supérieur à 8, je ne sais pas quel est le résultat attendu, en fait j'aimerais déjà régler le problème pour égal ou inférieur à 8.

Merci par avance

**fab13** · 29/05/2011, 07h11

ça avance, j'ai reécrit ma fonction récursive car elle ne me semblait pas tout à fait correcte :

voici le code en entier :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
 
 
/* parallel computation of pi with monte carlo method */ 
#include <stdio.h> 
#include <stdlib.h>
#include <unistd.h> 
#include <fcntl.h> 
#include <sys/types.h> 
#include <sys/stat.h> 
#include <sys/mman.h> 
#include <sys/wait.h> 
#include <sys/time.h> 
 
 
 
 
 
double* Parallel( double* smem, pid_t* pid, int nbt, long int niter, int statut, int options, int nb_current )
{
  double x,y,z;
  long int i;
     if (nbt==1)  
               { smem[nb_current]=0;
                 for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX; 
                        y = (double) rand() / RAND_MAX; 
                        z = x*x + y*y;
                        if (z <= 1.0) 
                 {smem[nb_current]++;}}		
               return smem;		 
		 }
 else if (nb_current==nbt) return smem;
     else {  pid[nb_current]=fork();                     
             if (pid[nb_current]<0) {exit(4);}
                else if (pid[nb_current]==0)
                {     smem=Parallel(smem,pid,nbt,niter,statut,options,nb_current+2);
                      // child process 
		      // compute the niter/nbt part of niter hits 
                      smem[nb_current] = 0;			      
                      for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX;
                        y = (double) rand() / RAND_MAX; 
	                z = x*x + y*y; 
	                if (z <= 1.0) {smem[nb_current]++; }
			}
                    exit(0);
                }
		else
		 {    // parent process 
		      // compute the niter/nbt part of niter hits 
                      smem[nb_current+1] = 0;		       		      
                      for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX;
                        y = (double) rand() / RAND_MAX; 
	                z = x*x + y*y; 
	                if (z <= 1.0) {smem[nb_current+1]++; }
			}
                        waitpid(pid[nb_current],&statut,options);			
 
 
                 }
 
                      return smem;}
 
 }  
 
 
 int main (int argc, char** argv) 
 
 { long int niter; // total number of hits 
   long int fd;long int i;
 
   int nb_thread; // number of threads 
   int nb_current=0; // current index in Parallel recursive function
   double x,y,z;
   double pi=0;
//   struct sharedMem* smem;
 
   double* smem;
   struct timeval chrono1, chrono2; 
   int micro, second; pid_t *pid; 
   int pseudo_pid;
   int statut;
   int options = 0;   
 
 
  // parse the argument 
  if (argc != 3)
   { printf ("Please, specify number of threads and iterations \n"); 
     exit (1); } 
 
     nb_thread = atoi(argv[1]);        
     niter = atol(argv[2]);   
 
 
  // create the shared memory 
  fd = open ("/tmp/myregion", O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
   if (fd == -1) exit(2);
 
    if (ftruncate (fd, nb_thread*sizeof (double)) == -1) exit(3); 
 
    smem = mmap (NULL, nb_thread*sizeof (double),
              PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
     if (smem == MAP_FAILED) exit(1); 
 
 
     // initialize pid and sharedMem j pointer
 
     pid=(pid_t*)malloc(nb_thread*sizeof(pid_t));
 
     srand(time(NULL));   
 
 
     // initialize random numbers 
     gettimeofday(&chrono1, NULL);
 
     // monte carlo's method in parallel 
 
     smem=Parallel(smem,pid,nb_thread,niter,statut,options,nb_current);
 
 
	gettimeofday(&chrono2, NULL); 
 
	close(fd); 
	// compute ellapsed time 
	micro = chrono2.tv_usec - chrono1.tv_usec; 
	if (micro < 0) 
	{ micro += 1000000; 
	second = chrono2.tv_sec - chrono1.tv_sec - 1; } 
	else 
	second = chrono2.tv_sec - chrono1.tv_sec;
 
	for(i=0;i<nb_thread;i++) 
	{printf("smem[%d]=%f\n",i,smem[i]);
	 pi = pi+ smem[i];}
	 pi= pi/niter;
	 pi *= 4; 
 
	// Output
 
	printf("# of trials= %ld , estimate of pi is %1.8f \n",niter,pi);
	printf("%d second %d micro ellapsed\n", second, micro);
 
  /* Don't forget to free the mmapped memory
     */
    if (munmap(smem, nb_thread*sizeof (double)) == -1) {
	printf("Error un-mmapping the file\n");
	/* Decide here whether to close(fd) and exit() or not. Depends... */
    }
 
    /* Un-mmaping doesn't close the file, so we still need to do that.
     */
    close(fd);
 
 
  return 0;
 
   }

Je récupère le nombre de thread comme premier argument (nb_thread) et le nombre d'itérations en second paramètre (niter).

Voici la fonction récursive :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
 
 
double* Parallel( double* smem, pid_t* pid, int nbt, long int niter, int statut, int options, int nb_current )
{
  double x,y,z;
  long int i;
     if (nbt==1)  
               { smem[nb_current]=0;
                 for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX; 
                        y = (double) rand() / RAND_MAX; 
                        z = x*x + y*y;
                        if (z <= 1.0) 
                 {smem[nb_current]++;}}		
               return smem;		 
		 }
 else if (nb_current==nbt) return smem;
     else {  pid[nb_current]=fork();                     
             if (pid[nb_current]<0) {exit(4);}
                else if (pid[nb_current]==0)
                {     smem=Parallel(smem,pid,nbt,niter,statut,options,nb_current+2);
                      // child process 
		      // compute the niter/nbt part of niter hits 
                      smem[nb_current] = 0;			      
                      for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX;
                        y = (double) rand() / RAND_MAX; 
	                z = x*x + y*y; 
	                if (z <= 1.0) {smem[nb_current]++; }
			}
                    exit(0);
                }
		else
		 {    // parent process 
		      // compute the niter/nbt part of niter hits 
                      smem[nb_current+1] = 0;		       		      
                      for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX;
                        y = (double) rand() / RAND_MAX; 
	                z = x*x + y*y; 
	                if (z <= 1.0) {smem[nb_current+1]++; }
			}
                        waitpid(pid[nb_current],&statut,options);			
 
 
                 }
 
                      return smem;}
 
 }

Pour nb_thread=2, j'ai bien 2 process qui tournent, mais en faisant un print du nombre respectif de valeurs aléatoires (de chaque thread) étant dans le quart de disque (pour le calcul de pi), je m'aperçois qu'elles sont strictement égales alors que je fais un random (pour x et pour y) bien distinct dans le processus fils et le processus père.

le paramètre nb_current est censé représenter la thread courante, je stocke le pid de celle-ci dans le tableau pid[nb_current] et les nombres aléatoires sont stockés dans smem[nb_current] pour le processus fils et smem[nb_current+1] dans le processus père.

Pour l'appel récursif, j'incrémente nb_current de 2, ce qui fait qu'au prochaine appel, je stocke les nombres aléatoires dans smem[nb_current+2] et smem[nb_current+3] pour fils et père et ainsi de suite jusqu'à atteindre nb_current=nb_thread.

Autrement dit, pour nb_thread=4, je fais 2 fork() et je sauve les tirages dans smem[0], smem[1], smem[3] et smem[4].

Mais voilà, le code me donne des valeurs identiques entre les tirages.

Exemple :

$ ./a.out 2 100000000 ==================> 2 threads et niter=10^8
smem[0]=39274703.000000 ==============> premier random de x et y sur 5.10^7
smem[1]=39274703.000000 ==============> deuxième random de x et y sur 5.10^7
# of trials= 100000000 , estimate of pi is 3.14197624
2 second 838424 micro ellapsed

Maintenant pour nb_thread=4 :
./a.out 4 100000000 ====================> 4 threads niter=10^8
smem[0]=19636177.000000
smem[1]=19637354.000000 =================|
smem[2]=19637354.000000 =================| valeurs identiques malgrè un rand()
smem[3]=19637354.000000 =================| dans chacun des process
# of trials= 100000000 , estimate of pi is 3.14192956
2 second 906183 micro ellapsed

Il doit y avoir d'après moi un écrasement des valeurs entre les éléments du tableau smem[1], smem[2] et smem[3].

Est-ce mmap qui est en cause ou plutôt l'appel récursif dans le main :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
 
   // monte carlo's method in parallel 
 
     smem=Parallel(smem,pid,nb_thread,niter,statut,options,nb_current);

?

Quelqu'un verrait-il où se trouve l'erreur ?

**gangsoleil** · 30/05/2011, 11h25

Bonjour,

Attention, tu confonds threads (que tu n'utilises pas) et processus. ps n'affiche d'ailleurs pas le nombre de threads, mais la liste des processus.

A part ca, random ne fonctionne pas comme tu le crois. Je te conseille de lire les pages de manuel de rand et de srand.

**fab13** · 02/06/2011, 02h53

En effet, la fonction "srand(time(NULL))" a du être changée car elle me générait la même suite de nombres aléatoires.

J'ai utilisé la fonction assembleur "rdtsc()" pour résoudre ce problème. Voici donc le code qui marche bien :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
 
 
/* parallel computation of pi with monte carlo method */ 
#include <stdio.h> 
#include <stdlib.h>
#include <unistd.h> 
#include <fcntl.h> 
#include <sys/types.h> 
#include <sys/stat.h> 
#include <sys/mman.h> 
#include <sys/wait.h> 
#include <sys/time.h> 
#include <sys/ipc.h>
#include <sys/shm.h>
 
 
int rdtsc()
{
    __asm__ __volatile__("rdtsc");
}
 
 
 
double* Parallel( double* smem, pid_t* pid, int nbt, long int niter, int statut, int options, int nb_current )
{
  double x,y,z;
  long int i;
  int j;
 
  srand(rdtsc());     
 
 
 
     if (nbt==1)  
               { smem[nb_current]=0;
                 for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX; 
                        y = (double) rand() / RAND_MAX; 
                        z = x*x + y*y;
                        if (z <= 1.0) 
                 {smem[nb_current]++;}}		
               return smem;		 
		 }
 else if (nb_current==nbt) return smem;
     else {  pid[nb_current]=fork();
             if (pid[nb_current]<0) {exit(4);}
                else if (pid[nb_current]==0)
                {    smem=Parallel(smem,pid,nbt,niter,statut,options,nb_current+2);       		                            
		      // child process 
		      // compute the niter/nbt part of niter hits 
                      smem[nb_current] = 0;			      
                      for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX;
                        y = (double) rand() / RAND_MAX; 
	                z = x*x + y*y; 
	                if (z <= 1.0) {smem[nb_current]++; }
			}
                      printf("------smem in child-----nb_current=%d ---\n",nb_current);			
		      for(j=0;j<nbt;j++)
		      {printf("smem[%d]=%f\n",j,smem[j]);}		
                      exit(0);
                }
		else
		 {    // parent process 
		      // compute the niter/nbt part of niter hits 
                      smem[nb_current+1] = 0;		       			      
                      for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX;
                        y = (double) rand() / RAND_MAX; 
	                z = x*x + y*y; 
	                if (z <= 1.0) {smem[nb_current+1]++; }
			}
		      waitpid(pid[nb_current],&statut,options);			                      
                      printf("------smem in parent-----nb_current+1=%d ---\n",nb_current+1);			
		      for(j=0;j<nbt;j++)
		      {printf("smem[%d]=%f\n",j,smem[j]);}			
 
 
		      return smem;
 
                 }
 
}
 
}  
 
 
 int main (int argc, char** argv) 
 
 { long int niter; // total number of hits 
   long int fd;long int i;
 
   int shm_id;
   int nb_process; // number of threads 
   int nb_current=0; // current index in Parallel recursive function
   double x,y,z;
   double pi=0;
//   struct sharedMem* smem;
 
   double* smem;
   struct timeval chrono1, chrono2; 
   int micro, second; pid_t *pid; 
   int pseudo_pid;
   int statut;
   int options = 0;   
 
 
  // parse the argument 
  if (argc != 3)
   { printf ("Please, specify number of threads and iterations \n"); 
     exit (1); } 
 
     nb_process = atoi(argv[1]);        
     niter = atol(argv[2]);   
 
 
  // create the shared memory 
 fd = open ("/tmp/myregion", O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
   if (fd == -1) exit(2);
 
    if (ftruncate (fd, nb_process*sizeof (double)) == -1) exit(3); 
 
    smem = mmap (NULL, nb_process*sizeof (double),
              PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
     if (smem == MAP_FAILED) exit(1); 
 
 
     // initialize pid 
 
     pid=(pid_t*)malloc(nb_process*sizeof(pid_t));
 
     // initialize time process
 
     gettimeofday(&chrono1, NULL);
 
     // monte carlo's method in parallel 
 
     smem=Parallel(smem,pid,nb_process,niter,statut,options,nb_current);
 
 
	gettimeofday(&chrono2, NULL); 
 
	close(fd); 
	// compute ellapsed time 
	micro = chrono2.tv_usec - chrono1.tv_usec; 
	if (micro < 0) 
	{ micro += 1000000; 
	second = chrono2.tv_sec - chrono1.tv_sec - 1; } 
	else 
	second = chrono2.tv_sec - chrono1.tv_sec;
	//printf("-------------------------\n");	
	for(i=0;i<nb_process;i++) 
	{//printf("smem[%d]=%f\n",i,smem[i]);
	 pi = pi+ smem[i];}
	 pi= pi/niter;
	 pi *= 4; 
 
	// Output
 
	printf("# of trials= %ld , estimate of pi is %1.8f \n",niter,pi);
	printf("%d second %d micro ellapsed\n", second, micro);
 
  /* Don't forget to free the mmapped memory
     */
    if (munmap(smem, nb_process*sizeof (double)) == -1) {
	printf("Error un-mmapping the file\n");
	/* Decide here whether to close(fd) and exit() or not. Depends... */
    }
 
    /* Un-mmaping doesn't close the file, so we still need to do that.
     */
    close(fd);
 
 
  return 0;
 
   }

Désormais, j'ai un autre problème intéressant. Prenons par exemple l'exécution avec 4 process avec un tirage de 100 millions :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
 
$ ./a.out 4 100000000
------smem in child-----nb_current=2 ---
smem[0]=19633188.000000
smem[1]=19535673.000000
smem[2]=19637714.000000
smem[3]=19635183.000000
------smem in parent-----nb_current+1=3 ---
smem[0]=19633188.000000
smem[1]=19538898.000000
smem[2]=19637714.000000
smem[3]=19635183.000000
------smem in child-----nb_current=0 ---
smem[0]=19632508.000000
smem[1]=19635040.000000
smem[2]=19637714.000000
smem[3]=19635183.000000
------smem in parent-----nb_current+1=1 ---
smem[0]=19632508.000000
smem[1]=19635040.000000
smem[2]=19637714.000000
smem[3]=19635183.000000
# of trials= 100000000 , estimate of pi is 3.14161780 
2 second 949336 micro ellapsed

nb_current représente l'appel récursif en cours, il est incrémenté de 2 par chaque appel à la fonction "Parallel". Pour le processus père et fils, je fais un printf des élements du tableau smem[nb_current] et smem[nb_current+1] que je suis en train de remplir pour le "nb_current" ième appel.

Le problème est que je suis obligé d'utiliser dans le processus père courant la fonction "waitpid" pour attendre que le processus fils désigné par "pid[nb_current]" ait fini ses opérations, cad de remplir le vecteur "smem[nb_current]".

voici en gras où est placé ce waitpid :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23


}
		else
		 {    // parent process 
		      // compute the niter/nbt part of niter hits 
                      smem[nb_current+1] = 0;		       			      
                      for (i=0; i<(niter/nbt); i++) 
                      { x = (double) rand() / RAND_MAX;
                        y = (double) rand() / RAND_MAX; 
	                z = x*x + y*y; 
	                if (z <= 1.0) {smem[nb_current+1]++; }
			}
		      waitpid(pid[nb_current],&statut,options);			                      
                      printf("------smem in parent-----nb_current+1=%d ---\n",nb_current+1);			
		      for(j=0;j<nbt;j++)
		      {printf("smem[%d]=%f\n",j,smem[j]);}			
		     
                      
		      return smem;
		      
                 }

Pour 4 process, un "ps aux | grep .aout" ne me donne que 3 processus. car en fait, le processus pid[1] doit attendre que le pid[0] ait fini, le processus pid[3] doit attendre que le pid[2] ait fini, mais pour que pid[0] ait fini, il faut avant que pid[2] et pid[3] aient aussi fini.

J'ai donc 3 processus qui tournent à peu près simultanément (pour un tirage assez grand comme 100 millions) : pid[1], pid[2], pid[3],

puis dans un second temps pid[0] qui commence dès que pid[2] et pid[3] sont finis.

Si vous faites tourner le programme, vous pourrez voir l'ordre d'affichage des éléments smem[nb_current] qui reflète bien l'ordre des attentes entre les processus.

Si je me passe de la fonction "waitpid", alors c'est le gros bordel, le résultat est complètement faux.

Comment faire alors pour se passer de ce "waitpid bloquant" tout en préservant le partage de mémoire entre les différentes tâches, c'est à dire le remplissage des vecteurs smem[nb_current] et smem[nb_current+1] ? Ceci me permettrait d'avoir effectivement 4 processus qui exécutent leur tâche en même temps et non plus "3 et puis après un quatrième en dernier" .

Quand on garde le "waitpid", je crois qu'on qualifie de "synchrone" cette façon de faire ...

Toute idée est la bienvenue !

ps: j'ai vu au cours de mes recherches l'api java fork/join qui souligne ce problème

**olivieram** · 12/06/2011, 07h14

Bonjour,
Effectivement, tu parles de threads, mais ce ne sont que des processus (ce qui est mieux, tu ne pourrais pas lire mieux tes états avec des threads).
Moi, dans ton problème, la seule chose que je vois c'est que :

tu as un processus ancètre,
tu veux pour nb_threads=4, au maximum 4 processus crées
Je remarque que : avec le processus ancètre, cela fait déjà 1 processus.
avec le premier fork() tu crées un second processus et le père est l'ancètre.
avec le second fork() tu crées un troisième processus et le père est le processus fils du premier fork()

Il te manque un 3ème fork pour faire ton 4ème processus.

Problème code multi-threadé avec fonction récursive

C

Discussions similaires

Partager

Partager