1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
|
#!/usr/bin/perl
#Print errors during the implementation of the perl program
use strict; use POSIX;
#C core to increase the efficiency of the comparison program
#This program takes 3 arguments (2 strings and one threshold: respectively str_a, str_b and max). This
#program will then compare the two strings, and if the number of differences is higher than max, then
#the program returns the number of differences and stop, or returns the real number of differences.
use Inline C => <<'END_OF_C';
int c_compare( SV* str_a, SV* str_b, int max ) {
int diff = 0;
STRLEN len_a, len_b, len, i;
char *a, *b;
a = SvPV( str_a, len_a );
b = SvPV( str_b, len_b );
len = len_a < len_b ? len_a : len_b;
for( i = 0; i < len; i++ ) {
if( a[i] != b[i] ) diff++;
if(diff > max) break;
}
return diff;
}
END_OF_C
#Defined variables
my @childs = (); my $pid = 0;
my @childs2 = (); my $pid2 = 0;
my $file = "";
my $tmp = "";
my $dist_cutoff = 0;
my $dist_cutoff_stored = 0;
my $dist_cutoff_step = 0;
my $dist_cutoff_step_stored = 0;
my $mean_length = 0;
my $i, my $j, my $k, my $h, my $x= 0,0,0,0,0;
my $nb_char_treated = 0;
my $nb_treated_lines = 0;
my $max_abund = 0;
my $max_abund_pos = 0;
my $nb_seqs_aligned_class = 0;
my $exchange = "";
my $count = 0;
my $count_diff = 0;
my $column_analysis = "";
my $descr_line = "";
my $char1 = "";
my @tmp, my @tmp2 = (),();
my @tab_descr = ();
my @tab_descr_occur = ();
my @tab_align_seq = ();
my @tab_align_seq_clean = ();
my $nb_seq_tot = 0;
my $nb_seq_tot_derep = 0;
my @seqs_already_clustered = ();
my @seq1 = ();
my @seq2 = ();
my $seq1 = "";
my $seq2 = "";
my @defined_clusters = ();
my $nb_clusters = 0;
my $length_seq1, my $length_seq2 = 0,0;
#Asking for the desired maximum dissimilarity percentage tolerated for clustering
print ("Please define the maximum percentage dissimilarity threshold for clustering\n (percent) ?\n");
#Deleting the last character of the string and transforming it in integer,
chomp($dist_cutoff = <STDIN>);
$dist_cutoff = 0 + $dist_cutoff;
$dist_cutoff_stored = $dist_cutoff;
#Asking for the step size between 0 and the defined threshold
print ("Please define the step size percentage for clustering (percent) ?\n");
#Deleting the last character of the string and transforming it in integer
chomp($dist_cutoff_step = <STDIN>);
$dist_cutoff_step = 0 + $dist_cutoff_step;
$dist_cutoff_step_stored = $dist_cutoff_step;
#=======================================================================================================
#First step, store all descriptive lines and corresponding sequences in two tables: @tab_descr and
#@tab_align_seq for further analysis.
#=======================================================================================================
#Open the directory containing all files needed to be treated
opendir (DIR, "IN/") || die "can't opendir IN: $!";
#For each file found in the directory IN/
BOUCLE:while (defined($file = readdir(DIR)))
{
#No treatment for files '.' and '..'
if ($file =~ /^\.\.?$/){next BOUCLE;}
else
{
$pid = fork();
#If the program is the parent! (only one)
if ($pid) {push(@childs, $pid);}
#If the program is a child (max 10)
elsif ($pid == 0)
{
#Open file needed to be treated
open(F1, "IN/$file")|| die "==> File Not Found: $file<==";
#Needed empty variables for each step (several files)
$mean_length = 0;
$nb_seq_tot_derep = 0;
$nb_seq_tot = 0;
@tab_descr = ();
@tab_align_seq = ();
@tmp = ();
#For every line of the file given by the user
BOUCLE1:while (<F1>)
{
#If the line contains the character : ">"
if ($_ =~ s/>//)
{
#We do not keep the last line of the alignment, corresponding to the template
if ($_ =~ m/#=GC_RF/) {next BOUCLE1;}
else
{
#Split the line to store it in @tmp based on the '_' character
@tmp = split("_",$_);
#Count the number of sequences (not dereplicated to give in results file)
$nb_seq_tot_derep += $tmp[1];
#Store in $tab_descr_occur the number of occurences for each sequence for further sorting
$tab_descr_occur[$nb_seq_tot]=$tmp[1];
#Determine the mean length for sequences using dereplicated sequences
$tmp[2] =~ /length=(\d*)/;
$mean_length += $1*$tmp[1];
#Store the descriptive line in @tab_descr and increment the needed value $nb_seq_tot
#after deletion of potential spaces or \n
$_=~ s/\s//g;
$tab_descr[$nb_seq_tot++] = $_;
}
}
#If the line is the sequence itself
else
{
#We not keep the last line of the alignment, corresponding to the template
if ($_ =~m/x/) {next BOUCLE1;}
$tab_align_seq[$nb_seq_tot-1] = $_;
}
}
#We close the file as it is not needed anymore
close F1;
#Computation of the mean length to determine the number of tolerated differences
$mean_length = int($mean_length/$nb_seq_tot_derep+0.5);
#Determine the number of tolerated differences to cluster sequences and the step size
$dist_cutoff = ceil($mean_length*$dist_cutoff_stored/100);
$dist_cutoff_step = floor($mean_length*$dist_cutoff_step_stored/100);
#We cleared out the @tmp table to reuse it
@tmp = ();
#===================================================================================================
#Second step, reorganize sequences and descriptive lines based on their decreased abundance.
#===================================================================================================
#Needed empty variables for next step
@tmp = ();
$nb_seqs_aligned_class = 0;
#Loop to read all sequences one by one
BOUCLE2:for ($i=0;$i<$nb_seq_tot;$i++)
{
#Definition of a new table containing three elements, the number of occurences, the descriptive
#line and the corresponding sequence
@tmp[$i] = [$tab_descr_occur[$i],$tab_descr[$i],$tab_align_seq[$i]];
}
#Reorganizing the table @tmp defined previouly by decreasing abundance based on the occurences
@tmp2 = reverse sort { $a->[0] <=> $b->[0] } @tmp;
#Needed empty variables
@tmp = ();
@tab_descr = ();
@tab_align_seq = ();
#Open the temporary file in the Tmp/ directory
#Loop to recreate reorganized tables
BOUCLE3:for ($i=0;$i<$nb_seq_tot;$i++)
{
$tab_descr[$i] = $tmp2[$i][1];
$tab_align_seq[$i] = $tmp2[$i][2];
}
#Opening needed files to to store results (details and summary)
$file =~ /(\w*)\./;
open (FILE_R_DETAILS, ">OUT/$1_clust.clust");
open (FILE_R, ">OUT/$1_clust.txt");
#Print needed data in files for further steps
printf (FILE_R_DETAILS "File(s):\t$file\nSequences:\t$nb_seq_tot_derep \n\n");
printf (FILE_R "Total sequences\t$nb_seq_tot_derep \n\nCutoff\tClusters(OTUs)\n");
#Close files to give them for child processes
close (FILE_R_DETAILS);
close (FILE_R);
#Needed empty variables for next step
@tab_align_seq_clean = ();
@tmp = ();
$count = 0;
$nb_treated_lines = 0;
$column_analysis = "";
BOUCLE10:for($h=$dist_cutoff_step;$h<=$dist_cutoff;$h=$h+$dist_cutoff_step)
{
#Empty variable needed to treat several clustering for one file
$descr_line = "";
@defined_clusters = ();
$nb_clusters = 0;
$pid2 = fork();
#If the program is the parent! (only one)
if ($pid2) {push(@childs2, $pid2);}
#If the program is a child
elsif ($pid2 == 0)
{
#Definition of a table to determine if each sequence is already clustered (1) or not (0) to
#increase the speed analysis.
$i = 0;
BOUCLE6:foreach $tmp (@tab_descr)
{
$seqs_already_clustered[$i] = 0;
$i++;
}
#Loop to test each sequence one after one
BOUCLE7:for ($i=0;$i<$nb_seq_tot;$i++)
{
#Test to verify if this sequence is already clustered or not based on the table defined
#previously and named @seqs_already_clustered (1, already clustered)
if ($seqs_already_clustered[$i]==1) {next BOUCLE7;}
#$seqs_already_clustered = 0, need to be clustered
else
{
#Store the descriptive line for further treatment and creation of result files.
$descr_line = $tab_descr[$i];
#As this sequence is used to define the cluster, we put 1 in the right position of
#the @seqs_already_clustered table
$seqs_already_clustered[$i] = 1;
#For each treated sequence compared to the first one chosen by BOUCLE7
BOUCLE8:for ($j=1+$i;$j<$nb_seq_tot;$j++)
{
#Verifying if this sequence is already clustered (1), then go to the next one
if ($seqs_already_clustered[$j]==1) {next BOUCLE8;}
#If this sequence is not clustered yet (0)
else
{
#Determine using the C core at the top of the program the number of differences
#between two sequences passed as input
$count_diff = c_compare($tab_align_seq[$i], $tab_align_seq[$j], $h);
#If the two sequences are sufficiently similar, then we clustered them
if ($count_diff <= $h)
{
#We indicate in the table @seqs_already_clustered the clustering to not treat
#anymore this sequence and add its descriptive line in $descr_line
$seqs_already_clustered[$j] = 1;
$descr_line = $descr_line." ".$tab_descr[$j];
}
}
}
#At the end of the comparison between one sequence chosen by BOUCLE7 and all others, we create a
#new cluster in the table @defined_clusters and we increment the number of clusters ($nb_clusters)
$defined_clusters[$nb_clusters] = $descr_line;
$nb_clusters++;
}
}
#===================================================================================================
#Last step, store results in two new result files. The first giving details of clusters for each
#step of clustering (max and step size defined by the user). The second file gives only the number
#of clusters for each step. All results are given with the taking account for the number of
#dereplicated sequences.
#===================================================================================================
#Calculation of each step size to determine each clustering
$j = ceil($h/$mean_length*1000);
$j = $j/1000;
#Print only three numbers after the '.'
$j = sprintf ("%0.3f", $j);
open (FILE_R_DETAILS, ">>OUT/$1_clust.clust");
flock(FILE_R_DETAILS,2);
open (FILE_R, ">>OUT/$1_clust.txt");
flock(FILE_R,2);
#Print needed data in result files
printf (FILE_R_DETAILS "distance cutoff:\t$j\nTotal Clusters:\t$nb_clusters\n");
printf (FILE_R "$j\t$nb_clusters\n");
#Loop to print each defined cluster
for ($i=0;$i<$nb_clusters;$i++)
{
#Split the cluster to determine the number of real sequences (not dereplicated for further analysis)
@tmp = split(" ",$defined_clusters[$i]);
#Needed empty variable to determine the real number of sequences (not dereplicated)
$j = 0;
#For each sequence in the cluster
foreach $char1 (@tmp)
{
#Split based on the '_' character to found the real number of dereplicated sequences
@tmp2 = split("_",$char1);
#Count the number of sequences
$j = $j+$tmp2[1];
}
#Print the cluster in the result file
printf(FILE_R_DETAILS "$i\t$file\t$j\t$defined_clusters[$i]\n");
}
printf (FILE_R_DETAILS "\n");
flock(FILE_R_DETAILS,8);
close (FILE_R_DETAILS);
flock(FILE_R,8);
close (FILE_R);
exit(0);
}
#If we can't fork (create parent and child processes, die the process)
else {die "couldnt fork: $!\n";}
}
#foreach (@childs2) {waitpid($_, 0);}
exit(0);
}
else {die "couldnt fork: $!\n";}
#The parent process wait for all child results
foreach (@childs) {waitpid($_, 0);}
}
}
closedir DIR; |
Partager