function [Keep_these, P] = tail_killer(M, alpha)
% TAIL_KILLER  Remove erroneous data from a supposedly normally distributed series.
%
% Algorithm:
% If the series is Normal, the probability of its extremes (max and min values)
% follows the Gumbel (extreme-value) distribution.
% a) First estimate a robust mean and standard deviation (by lowering the weights
%    of the tails in the computation).
% b) From this, the largest probable absolute value is computed and the tails are
%    cut according to alpha.
% c) Steps a) and b) are repeated until no more tail is cut OR the remaining
%    series has 5 elements or fewer.
%
% Input arguments:
% M     -> the data. M must be a vector (either a column vector or a row vector).
% alpha -> the accepted risk (probability) of killing a point of the real tail.
%          Default value is 0.01. Very versatile; no apparent need to change it.
%          Note 1: a small risk (~0.01) keeps larger tails; a large risk (~0.1)
%                  cuts the tail much shorter; a silly risk (>=0.5) does weird
%                  things (just try).
%          Note 2: this parameter is insignificant for long series (>10000 values)
%                  because in that case the expected tail is known with good accuracy.
%
% Output arguments (length of M, 1 column):
% Keep_these -> logical vector. Core = M(Keep_these). Tail = M(~Keep_these).
% P          -> probability of the min (or max) of the distribution being larger
%               (or smaller) than the corresponding element of M. P always refers
%               to the latest test.
%
% Called with no argument, runs the self-test suite (test_tail_killer).
%
% Requirements:
% MATLAB 7.7 or later (might work on prior releases but untested).
% Statistics Toolbox (norminv, evcdf).
%
% Author
% Olivier Planchon, Olivier.Planchon#at#Gmail.com
% IRD www.ird.fr - UMR LISAH http://www.umr-lisah.fr/
%
% Licence: Free. Please acknowledge this version when re-using the code.
% ==========================================
% Initialise outputs for consistency.
Keep_these = [] ;
P = [] ;
% No argument -> launch the self test; missing alpha -> default value.
if nargin == 0
test_tail_killer ;
return
elseif nargin == 1
alpha = 0.01 ;
end
% Robust standard deviation and mean of the core of the distribution
% (i.e. the distribution deprived of its tails).
[rstd, rmean] = robust_std(M) ;
n = length(M) ;
% Sort, centre and normalise |M|. Only the symmetric case is of interest
% (both tails are cut): Mcr spans ~0 ... max(abs(M)).
[Mcr, I] = sort(abs(M-rmean)./rstd) ;
% Empirical probability of not being exceeded in absolute value within the
% sample (from 0.5 to 0.995 for n = 100).
P = 0.5 : 1/2/n : 1-1/2/n ;
% Esp_max(k) is the expectation of the maximum of k normal numbers (2.578 for k = 100).
Esp_max = norminv(P, 0, 1) ;
% Sigma_max(k) is the standard deviation of the maximum of k normal numbers (0.3539 for k = 100).
Sigma_max = norminv(1-1./((1:n).*exp(1)),0,1) - Esp_max ;
% P(k) is the probability that k normal numbers have a maximum (in absolute
% value) less than or equal to Mcr(k). If P(k) is too small, the corresponding
% value must be rejected.
P = evcdf(-Mcr(:), -Esp_max(:), Sigma_max(:)) ;
% Build the outputs. When P (probability of staying below the population
% maximum) is too small, the element is rejected.
% Bug fix: tail is now the exact complement of core. The original used
% tail = (P <= alpha), so a point with P exactly equal to alpha was counted
% in BOTH sets and was killed despite also belonging to the core.
core = (P >= alpha) ;
tail = ~core ;
Keep_these = true(length(M), 1) ;
Keep_these(I(tail)) = false ;
if nargout >= 2
% Un-sort P so that it matches the original ordering of M.
P(I) = P ;
end
% Recurse on the remaining distribution until nothing more is cut or the
% core becomes too small (5 elements or fewer).
if any(tail) && nnz(core) > 5
if nargout >= 2
% Call tail_killer on the remaining points.
[kp, pr] = tail_killer(M(Keep_these), alpha) ;
% Consolidate the recursive results into the full-length outputs.
P(Keep_these) = pr ;
Keep_these(Keep_these) = kp ;
else
% Same thing with only one output argument.
kp = tail_killer(M(Keep_these), alpha) ;
Keep_these(Keep_these) = kp ;
end
end
function [rstd, rmean, N, P, I] = robust_std(N)
% ROBUST_STD  Robust estimate of the standard deviation (and mean) of the core
% of a distribution, obtained by down-weighting the tails in a weighted fit of
% Henry's line (normal probability plot).
%
% Input:
%   N -> data vector. Called with no argument, runs the self test instead.
% Outputs:
%   rstd  -> robust standard deviation of the core of N.
%   rmean -> robust mean of the core of N.
%   N     -> the input data, sorted ascending (possibly transposed, see below).
%   P     -> theoretical normal quantiles matching the sorted N.
%   I     -> sort permutation returned by sort(N).
%
% NOTE(review): in the nargin==0 branch below, the locals assigned are
% (rstd, N, P) -- the second declared output, rmean, is left unassigned, so
% calling robust_std() with two or more output arguments errors. Kept as-is.
if nargin==0
[rstd, N, P] = test_robust_std ;
return
end
n = length(N) ;
[N, I] = sort(N) ;
% Midpoint plotting positions, then their standard normal quantiles.
R = (0.5:n-0.5)/n ;
P = norminv(R, 0, 1) ;
% norminv returns a row vector; transpose N if its orientation differs.
if any(size(N)~=size(P))
N=N' ;
end
% A point's weight is larger at the centre of the distribution.
W = normpdf((R-0.5).*P(end).*2) ;
W = exp(W * (numel(W)/sum(W))) - 1 ;
% Fit the line P = a * N + b (Henry's line,
% http://fr.wikipedia.org/wiki/Droite_de_Henry).
% Under the normality assumption, the abscissa at P=0 is rmean and the
% abscissa at P=1 is rmean + rstd, hence: rmean = -b/a, rstd = 1/a.
[s, in] = wlinreg(P, N, W) ;
rstd = 1/s ;
rmean = -in/s ;
function [rstd, N, P] = test_robust_std
% Self test for robust_std: builds a N(0,1) sample polluted by a small fraction
% of points drawn from a much wider normal distribution, then runs robust_std
% on the mixture.
clc
npts = 1000 ;      % size of the well-behaved N(0,1) sample
pol_frac = 0.1 ;   % fraction of polluting points
pol_sigma = 10 ;   % standard deviation of the polluting distribution
clean = norminv((0.5:npts-0.5)/npts, 0, 1) ;
noisy = norminv((0.5:npts*pol_frac-0.5)/npts/pol_frac, 0, pol_sigma) ;
N = [clean(:) ; noisy(:)] ;
[rstd, N, P] = robust_std(N) ;
function [slope, intercept] = wlinreg(Y, X, W)
% WLINREG  Weighted simple linear regression: fits Y = slope*X + intercept.
%
% Inputs:
%   Y -> ordinates (vector).
%   X -> abscissae (vector, same orientation as Y). Default: 1..numel(Y).
%   W -> weights (vector, same orientation as X). Default: all ones.
% Outputs:
%   slope, intercept -> scalar coefficients of the fitted line.
if nargin == 1
% Bug fix: default abscissa is shaped like Y. The original 'X = 1:length(Y)'
% was always a row vector, so with a column Y the elementwise products below
% implicit-expanded into matrices and produced a wrong result.
X = reshape(1:numel(Y), size(Y)) ;
end
if nargin <= 2
W = ones(size(X)) ;
end
WX = W.*X ;
sWX = sum(WX) ;
% Normal equations of the weighted simple regression, solved as a 2x2 system.
R = [sum(WX.*X), sWX ; sWX, sum(W)] \ [sum(WX .* Y) ; sum(W .* Y)] ;
slope = R(1) ;
intercept = R(2) ;
function test_tail_killer
% Visual demonstration suite for tail_killer. Each test builds a series made of
% well-behaved normal points plus a few erroneous ones, then displays the result
% of the tail cutting via test_n_display_tail_killer.
clc
disp('Test 1.1: 10 normal points + 1 error')
clear
ncore = 10 ;    % number of points of the correct distribution
risk = 0.01 ;   % risk of wrongly discarding a genuine tail element
good = round(10 .* norminv((0.5:ncore-0.5)/ncore, 0, 1))./10 ;
wrong = [10] ;
test_n_display_tail_killer([wrong(:) ; good(:)], risk) ;   % bad points first
disp('Test 1.2: 20 normal points + 2 errors')
clear
ncore = 20 ;
risk = 0.01 ;
good = round(10 .* norminv((0.5:ncore-0.5)/ncore, 0, 1))./10 ;
wrong = [10 11] ;
test_n_display_tail_killer([wrong(:) ; good(:)], risk) ;
disp('Test 1.3: 30 normal points + 3 errors')
clear
ncore = 30 ;
risk = 0.01 ;
good = round(10 .* norminv((0.5:ncore-0.5)/ncore, 0, 1))./10 ;
wrong = [10 11 7] ;
test_n_display_tail_killer([wrong(:) ; good(:)], risk) ;
disp('Test 2: 50 normal points + 5 random points in another distribution')
clear
ngood = 50 ;    % number of points of the N(mgood, sgood) distribution
sgood = 1 ;
mgood = 1 ;
nbad = 5 ;      % number of erroneous points
sbad = 5 ;      % spread of the errors
mbad = -1 ;
risk = 0.01 ;
Ngood = norminv((0.5:ngood-0.5)/ngood, mgood, sgood) ;
Nbad = norminv((0.5:nbad-0.5)/nbad, mbad, sbad) ;
test_n_display_tail_killer([Nbad(:) ; Ngood(:)], risk) ;
disp('Test 3: 1000 normal points + 5 large errors. Test sevaral (large) values of alpha')
clear
series = [10.^(5:-1:0) norminv(0.001:0.001:0.999, 0, 1)] ;
series = series + rand(1, length(series)) ;
for risk = 0.1:0.2:0.6
test_n_display_tail_killer(series, risk) ;
end
disp('Test 5: test on white noise (Obviously not normal. Use at your own risk!')
clear
ngood = 50 ;
sgood = 1 ;
mgood = 1 ;
nbad = 5 ;
sbad = 5 ;
mbad = -1 ;
risk = 0.01 ;
Ngood = rand(ngood, 1) .* sgood + mgood ;
Nbad = rand(nbad, 1) .* sbad + mbad ;
test_n_display_tail_killer([Nbad(:) ; Ngood(:)], risk) ;
function test_n_display_tail_killer(M, alpha)
% Run tail_killer on M and display the result in figure 1:
%   1) Gaussian (probit) plot of the full series, kept core overlaid in blue;
%   2) the same plot restricted to the kept core;
%   3) probability of each point vs its value, on linear and semilog Y scales,
%      with the alpha rejection threshold drawn as a red line.
% Waits for a mouse click in the figure before returning.
[Keep_these, P] = tail_killer(M, alpha) ;
% Plot preparation: gaussian (probit) coordinate of each point of M.
[Ms, ix] = sort(M) ; %#ok<ASGLU>
Ps = norminv((((1:length(M))-0.5)./length(M)), 0, 1) ;
Ps(ix) = Ps ;   % un-sort so Ps matches the ordering of M
% Gaussian plot of the input.
try close(1) ;
catch %#ok<CTCH>
end
figure(1)
subplot(3,1,1)
hold on
plot(M, Ps, '.r') ;
plot(M(Keep_these), Ps(Keep_these), '.b') ;
grid on
ylabel ('Gaussian coordinate (sigma)')
xlabel ('Data unit')
title ('Distribution before and after tail killing')
subplot(3,1,2)
hold on
plot(M(Keep_these), Ps(Keep_these), '.b') ;
grid on
ylabel ('Gaussian coordinate (sigma)')
xlabel ('Data unit')
title ('Distribution after tail killing')
% Probability plot, normal scale.
subplot(3,2,5)
plot(M, P, '.r')
hold on
plot(M(Keep_these), P(Keep_these), '.b')
grid on
ylabel ('P')
xlabel ('Data unit')
u = xlim ;
plot(u(:), [alpha, alpha], 'r')   % alpha rejection threshold
xlim(u) ;
% Probability plot, semilog scale.
subplot(3,2,6)
semilogy(M, P, '.r')
hold on
semilogy(M(Keep_these), P(Keep_these), '.b')
grid on
ylabel ('P')
xlabel ('Data unit')
u = xlim ;
plot(u(:), [alpha, alpha], 'r')   % hold is on, so the log Y scale is preserved
xlim(u) ;
% Bug fix: removed a stray trailing '|' (copy/paste artifact) after ginput,
% and fixed the 'clik' typo in the user prompt.
fprintf(1, 'click in the figure to continue\n') ;
ginput(1)